diff --git a/.bazelrc b/.bazelrc index 2bd923c26db74c..92ddbf13548c4e 100644 --- a/.bazelrc +++ b/.bazelrc @@ -56,6 +56,7 @@ # # rbe_linux_cpu: RBE options to build with only CPU support. # rbe_linux_cuda: RBE options to build with GPU support using clang. +# rbe_linux_cuda_nvcc: RBE options to build with GPU support using nvcc. # # rbe_win_py39: Windows Python 3.9 RBE config # @@ -238,9 +239,12 @@ build:cuda_clang --@local_config_cuda//:cuda_compiler=clang # Select supported compute capabilities (supported graphics cards). # This is the same as the official TensorFlow builds. # See https://developer.nvidia.com/cuda-gpus#compute -# TODO(angerson, perfinion): What does sm_ vs compute_ mean? How can users -# select a good value for this? See go/tf-pip-cuda -build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_75,compute_80" +# `compute_XY` enables PTX embedding in addition to SASS. PTX +# is forward compatible beyond the current compute capability major +# release while SASS is only forward compatible inside the current +# major release. Example: sm_80 kernels can run on sm_89 GPUs but +# not on sm_90 GPUs. compute_80 kernels though can also run on sm_90 GPUs. +build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_80,compute_90" # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. build:cuda_clang_official --config=cuda_clang @@ -250,7 +254,7 @@ build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2" build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang" build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -build:cuda_clang_official --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain" +build:cuda_clang_official --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain" # Debug config build:dbg -c dbg @@ -488,12 +492,12 @@ build:rbe_linux --host_linkopt=-lm build:rbe_linux_cpu --config=rbe_linux # Linux cpu and cuda builds share the same toolchain now. -build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain" -build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain" -build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain-linux-x86_64" -build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.14-clang_config_platform//:platform" -build:rbe_linux_cpu --host_platform="@sigbuild-r2.14-clang_config_platform//:platform" -build:rbe_linux_cpu --platforms="@sigbuild-r2.14-clang_config_platform//:platform" +build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain" +build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain" +build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain-linux-x86_64" +build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.16-clang_config_platform//:platform" +build:rbe_linux_cpu --host_platform="@sigbuild-r2.16-clang_config_platform//:platform" +build:rbe_linux_cpu --platforms="@sigbuild-r2.16-clang_config_platform//:platform" # This is needed for all Clang17 builds but must not be present in GCC builds. 
build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument # This was added in clang-16 by https://reviews.llvm.org/D133574. @@ -502,7 +506,7 @@ build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument # See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183. build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions # Python config is the same across all containers because the binary is the same -build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.14-clang_config_python" +build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.16-clang_config_python" build:rbe_linux_cpu --python_path="/usr/bin/python3" # These you may need to change for your own GCP project. common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance @@ -523,9 +527,9 @@ build:rbe_linux_cuda --config=cuda_clang_official build:rbe_linux_cuda --config=rbe_linux_cpu # For Remote build execution -- GPU configuration build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.14-clang_config_cuda" -build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.14-clang_config_tensorrt" -build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.14-clang_config_nccl" +build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.16-clang_config_cuda" +build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_config_tensorrt" +build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl" test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" # ROCm @@ -539,6 +543,35 @@ build:rbe_linux_rocm_base --platforms="@ubuntu20.04-gcc9_manylinux2014-rocm_conf build:rbe_linux_rocm_base --action_env=TF_ROCM_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-rocm_config_rocm" build:rbe_linux_rocm_py3.9 --config=rbe_linux_rocm_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-rocm_config_python3.9" +build:rbe_linux_cuda_nvcc --config=cuda +build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1 +build:rbe_linux_cuda_nvcc --@local_xla//xla/python:enable_gpu=true +build:rbe_linux_cuda_nvcc --@local_xla//xla/python:jax_cuda_pip_rpaths=true +build:rbe_linux_cuda_nvcc --define=xla_python_enable_gpu=true +build:rbe_linux_cuda_nvcc --config=tensorrt +build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_75,compute_80" +build:rbe_linux_cuda_nvcc --action_env=TF_CUDA_VERSION="12" +build:rbe_linux_cuda_nvcc --action_env=TF_CUDNN_VERSION="8" +build:rbe_linux_cuda_nvcc --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2" +build:rbe_linux_cuda_nvcc --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" +build:rbe_linux_cuda_nvcc --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" +build:rbe_linux_cuda_nvcc --crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain" +build:rbe_linux_cuda_nvcc --config=rbe_linux +build:rbe_linux_cuda_nvcc --host_crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain" +build:rbe_linux_cuda_nvcc --extra_toolchains="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain-linux-x86_64" +build:rbe_linux_cuda_nvcc --extra_execution_platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" 
+build:rbe_linux_cuda_nvcc --host_platform="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" +build:rbe_linux_cuda_nvcc --platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" +build:rbe_linux_cuda_nvcc --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_python3.9" +build:rbe_linux_cuda_nvcc --python_path="/usr/bin/python3" +# These you may need to change for your own GCP project. +common:rbe_linux_cuda_nvcc --remote_instance_name=projects/tensorflow-testing/instances/default_instance +build:rbe_linux_cuda_nvcc --repo_env=REMOTE_GPU_TESTING=1 +build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_cuda" +build:rbe_linux_cuda_nvcc --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_tensorrt" +build:rbe_linux_cuda_nvcc --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_nccl" +test:rbe_linux_cuda_nvcc --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" + # TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed build:rbe_win --config=rbe_base build:rbe_win --crosstool_top="//tensorflow/tools/toolchains/win/tf_win_05022023:toolchain" @@ -593,8 +626,6 @@ try-import %workspace%/.bazelrc.user # Here are bazelrc configs for release builds # Build TensorFlow v2. test:release_base --test_size_filters=small,medium -# TODO(b/294367488) disable after 2.15 brancut -test:release_base --flaky_test_attempts=3 # Target the AVX instruction set build:release_linux_base --config=avx_linux @@ -632,7 +663,7 @@ test:release_linux_base --test_summary=short # Use the Clang toolchain to compile build:release_cpu_linux --config=release_linux_base -build:release_cpu_linux --crosstool_top="@sigbuild-r2.14-clang_config_cuda//crosstool:toolchain" +build:release_cpu_linux --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain" build:release_gpu_linux --config=release_cpu_linux # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml new file mode 100644 index 00000000000000..bb39d60168e08d --- /dev/null +++ b/.github/workflows/osv-scanner-scheduled.yml @@ -0,0 +1,39 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +name: OSV-Scanner Scheduled Scan + +on: + schedule: + - cron: 0 4 * * 1 + +permissions: + # Require writing security events to upload SARIF file to security tab + security-events: write + # Only need to read contents + contents: read + +jobs: + scan-scheduled: + uses: "google/osv-scanner/.github/workflows/osv-scanner-reusable.yml@main" + with: + scan-args: |- + --lockfile=requirements.txt:./requirements_lock_3_9.txt + --lockfile=requirements.txt:./requirements_lock_3_10.txt + --lockfile=requirements.txt:./requirements_lock_3_11.txt + --lockfile=requirements.txt:./requirements_lock_3_12.txt + --lockfile=requirements.txt:./ci/official/containers/linux_arm64/devel.requirements.txt + --lockfile=requirements.txt:./ci/official/containers/linux_arm64/jax.requirements.txt + --lockfile=requirements.txt:./ci/official/containers/linux_arm64/devel.usertools/test.requirements.txt \ No newline at end of file diff --git a/.github/workflows/sigbuild-docker-branch.yml b/.github/workflows/sigbuild-docker-branch.yml index 108fe471efa2db..9f842f9fb27c11 100644 --- a/.github/workflows/sigbuild-docker-branch.yml +++ b/.github/workflows/sigbuild-docker-branch.yml @@ -34,7 +34,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [python3.9, python3.10, python3.11] + python-version: [python3.9, python3.10, python3.11, python3.12] steps: - name: Delete unnecessary tools folder run: rm -rf /opt/hostedtoolcache diff --git a/.github/workflows/sigbuild-docker-presubmit.yml b/.github/workflows/sigbuild-docker-presubmit.yml index c61e65e7d834c0..03ae6f1dadf63f 100644 --- a/.github/workflows/sigbuild-docker-presubmit.yml +++ b/.github/workflows/sigbuild-docker-presubmit.yml @@ -32,7 +32,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [python3.9, python3.10, python3.11] + python-version: [python3.9, python3.10, python3.11, python3.12] permissions: contents: read pull-requests: write @@ -87,6 +87,7 @@ jobs: message: | I pushed these containers: + - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.12` - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.11` - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.10` - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.9` diff --git a/.github/workflows/sigbuild-docker.yml b/.github/workflows/sigbuild-docker.yml index ce9b99c494fc5e..5549f2995ac80f 100644 --- a/.github/workflows/sigbuild-docker.yml +++ b/.github/workflows/sigbuild-docker.yml @@ -37,7 +37,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [python3.9, python3.10, python3.11] + python-version: [python3.9, python3.10, python3.11, python3.12] steps: - name: Delete unnecessary tools folder run: rm -rf /opt/hostedtoolcache diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml index ca22041782fa4f..1b421effec8198 100644 --- a/.github/workflows/update-rbe.yml +++ b/.github/workflows/update-rbe.yml @@ -105,6 +105,18 @@ jobs: map sigbuild-r2.14-clang-python3.9 2.14-python3.9 map sigbuild-r2.14-clang-python3.10 2.14-python3.10 map sigbuild-r2.14-clang-python3.11 2.14-python3.11 + # TF 2.16 + map sigbuild-r2.16 2.16-python3.9 + map sigbuild-r2.16-python3.9 2.16-python3.9 + map sigbuild-r2.16-python3.10 2.16-python3.10 + map sigbuild-r2.16-python3.11 2.16-python3.11 + map sigbuild-r2.16-python3.12 2.16-python3.12 + # TF 2.16 + Clang (containers are the same, but env vars in configs.bzl are different) + 
map sigbuild-r2.16-clang 2.16-python3.9 + map sigbuild-r2.16-clang-python3.9 2.16-python3.9 + map sigbuild-r2.16-clang-python3.10 2.16-python3.10 + map sigbuild-r2.16-clang-python3.11 2.16-python3.11 + map sigbuild-r2.16-clang-python3.12 2.16-python3.12 - name: Create Pull Request with changes uses: peter-evans/create-pull-request@2b011faafdcbc9ceb11414d64d0573f37c774b04 # v4.2.3 with: diff --git a/RELEASE.md b/RELEASE.md index 632c3652af1eff..94c795442d8992 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -28,6 +28,8 @@ * `tf.lite` * Added support for `stablehlo.gather`. + * Added support for `stablehlo.add`. + * Added support for `stablehlo.multiply`. ## Keras diff --git a/ci/official/containers/linux_arm64/devel.packages.txt b/ci/official/containers/linux_arm64/devel.packages.txt index a8a9cb442c8b0b..efbae80eefacee 100644 --- a/ci/official/containers/linux_arm64/devel.packages.txt +++ b/ci/official/containers/linux_arm64/devel.packages.txt @@ -3,6 +3,8 @@ autoconf automake build-essential ca-certificates +# TODO(b/308399490) Remove CMake once dm-tree (Keras dependency) has 3.12 wheels +cmake llvm-17 clang-17 clang-format-12 diff --git a/ci/official/envs/ci_default b/ci/official/envs/ci_default index 183b2048ce1a5f..eb7938c8b3449d 100644 --- a/ci/official/envs/ci_default +++ b/ci/official/envs/ci_default @@ -16,6 +16,7 @@ TFCI_NIGHTLY_UPDATE_VERSION_ENABLE= TFCI_NVIDIA_SMI_ENABLE= TFCI_OUTPUT_DIR=build_output TFCI_LIBTPU_DOWNLOAD_ENABLE=0 +TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=0 TFCI_LIBTPU_DOWNLOAD_URL= TFCI_UPLOAD_LIB_ENABLE= TFCI_UPLOAD_LIB_LATEST_ENABLE= diff --git a/ci/official/envs/ci_nightly_uploads b/ci/official/envs/ci_nightly_uploads index e35a712d034966..ca6671f5ea3c59 100644 --- a/ci/official/envs/ci_nightly_uploads +++ b/ci/official/envs/ci_nightly_uploads @@ -1,6 +1,8 @@ -TFCI_UPLOAD_LIB_ENABLE=0 -TFCI_UPLOAD_LIB_LATEST_ENABLE= -TFCI_UPLOAD_WHL_GCS_ENABLE= +TFCI_UPLOAD_LIB_ENABLE=1 +TFCI_UPLOAD_LIB_GCS_URI="gs://libtensorflow-nightly/$(date -I)" +TFCI_UPLOAD_LIB_LATEST_ENABLE=1 +TFCI_UPLOAD_LIB_LATEST_GCS_URI="gs://libtensorflow-nightly/latest" +TFCI_UPLOAD_WHL_GCS_ENABLE=0 TFCI_UPLOAD_WHL_GCS_URI= -TFCI_UPLOAD_WHL_PYPI_ARGS= -TFCI_UPLOAD_WHL_PYPI_ENABLE= +TFCI_UPLOAD_WHL_PYPI_ARGS=(--config-file="$KOKORO_KEYSTORE_DIR/73361_tensorflow_pypirc_using_global_api_token" --repository pypi-warehouse) +TFCI_UPLOAD_WHL_PYPI_ENABLE=1 diff --git a/ci/official/envs/disable_all_uploads b/ci/official/envs/disable_all_uploads index b09169d677fa8f..6559f80e7f7c12 100644 --- a/ci/official/envs/disable_all_uploads +++ b/ci/official/envs/disable_all_uploads @@ -1,7 +1,9 @@ -TFCI_UPLOAD_LIB_ENABLE=0 +TFCI_DOCKER_REBUILD_UPLOAD_ENABLE=0 +TFCI_UPLOAD_LIB_ENABLE= +TFCI_UPLOAD_LIB_GCS_URI= TFCI_UPLOAD_LIB_LATEST_ENABLE= -TFCI_DOCKER_REBUILD_UPLOAD_ENABLE= +TFCI_UPLOAD_LIB_LATEST_GCS_URI= TFCI_UPLOAD_WHL_GCS_ENABLE= TFCI_UPLOAD_WHL_GCS_URI= -TFCI_UPLOAD_WHL_PYPI_ENABLE= TFCI_UPLOAD_WHL_PYPI_ARGS= +TFCI_UPLOAD_WHL_PYPI_ENABLE= diff --git a/ci/official/envs/nightly_linux_arm64_cpu_py310 b/ci/official/envs/nightly_linux_arm64_cpu_py310 index 7f1b040dff89b5..5b7900c43423b2 100644 --- a/ci/official/envs/nightly_linux_arm64_cpu_py310 +++ b/ci/official/envs/nightly_linux_arm64_cpu_py310 @@ -1,5 +1,6 @@ source ci/official/envs/ci_default -source ci/official/envs/ci_nightly_uploads +# Disable arm64 uploads while being worked on +source ci/official/envs/disable_all_uploads TFCI_PYTHON_VERSION=3.10 TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --config resultstore 
--repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 diff --git a/ci/official/envs/nightly_linux_arm64_cpu_py311 b/ci/official/envs/nightly_linux_arm64_cpu_py311 index 7779d84f0493ca..6edb93ba0bdf73 100644 --- a/ci/official/envs/nightly_linux_arm64_cpu_py311 +++ b/ci/official/envs/nightly_linux_arm64_cpu_py311 @@ -1,5 +1,6 @@ source ci/official/envs/ci_default -source ci/official/envs/ci_nightly_uploads +# Disable arm64 uploads while being worked on +source ci/official/envs/disable_all_uploads TFCI_PYTHON_VERSION=3.11 TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 diff --git a/ci/official/envs/nightly_linux_arm64_cpu_py312 b/ci/official/envs/nightly_linux_arm64_cpu_py312 new file mode 100644 index 00000000000000..dfe96fafb5568e --- /dev/null +++ b/ci/official/envs/nightly_linux_arm64_cpu_py312 @@ -0,0 +1,10 @@ +source ci/official/envs/ci_default +# Disable arm64 uploads while being worked on +source ci/official/envs/disable_all_uploads +TFCI_PYTHON_VERSION=3.12 +TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 +TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python +TFCI_DOCKER_REBUILD_ARGS=(--target=tf ci/official/containers/linux_arm64) +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 diff --git a/ci/official/envs/nightly_linux_arm64_cpu_py39 b/ci/official/envs/nightly_linux_arm64_cpu_py39 index 274f777423e132..e3b516111fdc85 100644 --- a/ci/official/envs/nightly_linux_arm64_cpu_py39 +++ b/ci/official/envs/nightly_linux_arm64_cpu_py39 @@ -1,5 +1,6 @@ source ci/official/envs/ci_default -source ci/official/envs/ci_nightly_uploads +# Disable arm64 uploads while being worked on +source ci/official/envs/disable_all_uploads TFCI_PYTHON_VERSION=3.9 TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 diff --git a/ci/official/envs/nightly_linux_x86_cpu_py312 b/ci/official/envs/nightly_linux_x86_cpu_py312 new file mode 100644 index 00000000000000..586fd92e5d703c --- /dev/null +++ b/ci/official/envs/nightly_linux_x86_cpu_py312 @@ -0,0 +1,10 @@ +source ci/official/envs/ci_default +# Disable 3.12 uploads while being worked on +source ci/official/envs/disable_all_uploads +TFCI_PYTHON_VERSION=3.12 +TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu +TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} +TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 diff --git a/ci/official/envs/nightly_linux_x86_cuda_py312 b/ci/official/envs/nightly_linux_x86_cuda_py312 new file mode 100644 index 00000000000000..4767f6dbdd6483 --- /dev/null +++ b/ci/official/envs/nightly_linux_x86_cuda_py312 @@ -0,0 +1,11 @@ +source ci/official/envs/ci_default +# Disable 3.12 uploads while being worked on +source 
ci/official/envs/disable_all_uploads +TFCI_PYTHON_VERSION=3.12 +TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda +TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag) +TFCI_DOCKER_ARGS=(--gpus all) +TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} +TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 \ No newline at end of file diff --git a/ci/official/envs/nightly_linux_x86_tpu_py310 b/ci/official/envs/nightly_linux_x86_tpu_py310 index da77f5ab3668ae..4e8014120f3762 100644 --- a/ci/official/envs/nightly_linux_x86_tpu_py310 +++ b/ci/official/envs/nightly_linux_x86_tpu_py310 @@ -1,11 +1,11 @@ source ci/official/envs/ci_default -source ci/official/envs/ci_nightly_uploads +# Disable tpu uploads while being worked on +source ci/official/envs/disable_all_uploads TFCI_PYTHON_VERSION=3.10 TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=tpu) TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag) TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) -TFCI_LIBTPU_DOWNLOAD_ENABLE=1 -TFCI_LIBTPU_DOWNLOAD_URL=https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.8.0/libtpu.so +TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=1 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 diff --git a/ci/official/envs/nightly_linux_x86_tpu_py311 b/ci/official/envs/nightly_linux_x86_tpu_py311 index 8f95c5df576b18..e4ae8cccf4fd46 100644 --- a/ci/official/envs/nightly_linux_x86_tpu_py311 +++ b/ci/official/envs/nightly_linux_x86_tpu_py311 @@ -1,11 +1,11 @@ source ci/official/envs/ci_default -source ci/official/envs/ci_nightly_uploads +# Disable tpu uploads while being worked on +source ci/official/envs/disable_all_uploads TFCI_PYTHON_VERSION=3.11 TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=tpu) TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag) TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) -TFCI_LIBTPU_DOWNLOAD_ENABLE=1 -TFCI_LIBTPU_DOWNLOAD_URL=https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.8.0/libtpu.so +TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=1 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 diff --git a/ci/official/envs/nightly_linux_x86_tpu_py312 b/ci/official/envs/nightly_linux_x86_tpu_py312 new file mode 100644 index 00000000000000..54d96b16548a4a --- /dev/null +++ b/ci/official/envs/nightly_linux_x86_tpu_py312 @@ -0,0 +1,11 @@ +source ci/official/envs/ci_default +# Disable tpu uploads while being worked on +source ci/official/envs/disable_all_uploads +TFCI_PYTHON_VERSION=3.12 +TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=tpu) +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu +TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag) +TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} 
+TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=1 +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 diff --git a/ci/official/envs/nightly_linux_x86_tpu_py39 b/ci/official/envs/nightly_linux_x86_tpu_py39 index b75e450cda1a0a..4adaa8b216fbba 100644 --- a/ci/official/envs/nightly_linux_x86_tpu_py39 +++ b/ci/official/envs/nightly_linux_x86_tpu_py39 @@ -1,11 +1,11 @@ source ci/official/envs/ci_default -source ci/official/envs/ci_nightly_uploads +# Disable tpu uploads while being worked on +source ci/official/envs/disable_all_uploads TFCI_PYTHON_VERSION=3.9 TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=tpu) TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag) TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) -TFCI_LIBTPU_DOWNLOAD_ENABLE=1 -TFCI_LIBTPU_DOWNLOAD_URL=https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.8.0/libtpu.so +TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=1 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 diff --git a/ci/official/envs/nightly_macos_arm64_py312 b/ci/official/envs/nightly_macos_arm64_py312 new file mode 100644 index 00000000000000..21432f076f6283 --- /dev/null +++ b/ci/official/envs/nightly_macos_arm64_py312 @@ -0,0 +1,9 @@ +source ci/official/envs/ci_default +source ci/official/envs/disable_all_uploads +TFCI_PYTHON_VERSION=3.12 +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_arm64 +TFCI_BAZEL_COMMON_ARGS=(--config release_macos_arm64 --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_DOCKER_ENABLE=0 +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_UPLOAD_WHL_GCS_ENABLE=1 diff --git a/ci/official/requirements_updater/README.md b/ci/official/requirements_updater/README.md index 292cb072a367aa..ad2e350dec8dc6 100644 --- a/ci/official/requirements_updater/README.md +++ b/ci/official/requirements_updater/README.md @@ -1,75 +1,128 @@ -### Hermetic Python +# Hermetic Python -Hermetic Python allows us not to rely on system-installed python and -system-installed python packages, instead we register our own python toolchain. +Hermetic Python removes the need to rely on system-installed Python and +system-installed Python packages. \ +Instead, an independent Python toolchain is registered, ensuring the right +dependencies are always used. \ See https://github.com/bazelbuild/rules_python/ for more details. -#### Hermetic Python toolchain details +### Specifying the Python version -By default, Python 3.9 is used. +Note: Only a limited number of minor Python versions are supported at any given time. -To set your own version for hermetic Python toolchain, use `TF_PYTHON_VERSION` -environment variable, e.g. +By default, the lowest supported version is used. + +To set a different version, use the `TF_PYTHON_VERSION` environment variable, +e.g.
``` -export TF_PYTHON_VERSION=3.10 +export TF_PYTHON_VERSION=3.11 ``` -To set a version from argument line, add to your command +To specify the version via a Bazel command argument, use the following: ``` ---repo_env=TF_PYTHON_VERSION=3.10 +--repo_env=TF_PYTHON_VERSION=3.11 ``` -### Requirements updater - -Requirements updater is a standalone tool intended to simplify process of -updating requirements for multiple versions of Python. +## Requirements updater -#### How to update/add requirements +The requirements updater is a standalone tool intended to simplify the process of +updating requirements for multiple minor versions of Python. -By default, the name of the input requirements file is `requirements.in`, -but it can be set using the `REQUIREMENTS_FILE_NAME` variable, for example: -``` -export REQUIREMENTS_FILE_NAME=`my_requirements.in` -``` +It takes in a file with a set of dependencies, and produces a more detailed +requirements file for each version, with hashes specified for each +dependency required, as well as their sub-dependencies. -To set a version from the argument line, add to your command -``` ---repo_env=REQUIREMENTS_FILE_NAME=`my_requirements.in` -``` +### How to update/add requirements -#### How to run the updater +By default, the name of the base requirements file is `requirements.in`, but it +can be set using the `REQUIREMENTS_FILE_NAME` variable. \ +For example: ``` -bash updater.sh +export REQUIREMENTS_FILE_NAME=my_requirements.in ``` -### How to add a new Python version - -1) In the `WORKSPACE` file add a new version to `python_versions` argument of -the `python_register_multi_toolchains` function. - -2) In `BUILD.bazel` file add a load statement for the new version, e.g. +To specify the file via a Bazel command argument, use the following: ``` -load("@python//3.11:defs.bzl", - compile_pip_requirements_3_11 = "compile_pip_requirements") +--repo_env=REQUIREMENTS_FILE_NAME=my_requirements.in ``` -Add a new entry for the loaded `compile_pip_requirements`, e.g. +### How to run the updater ``` -compile_pip_requirements_3_11( - name = "requirements_3_11", - extra_args = ["--allow-unsafe"], - requirements_in = "requirements.in", - requirements_txt = "requirements_lock_3_11.txt", -) +bash updater.sh ``` -3) Add the version to `SUPPORTED_VERSIONS` in `updater.sh`, after that run the - requirements updater tool. - -4) As a result, a new `requirements_lock_3_11.txt` file should appear under the -root of tensorflow directory. +## How to add a new Python version + +Note: Updating the +[rules-python](https://github.com/bazelbuild/rules_python/releases) version may +be required before going through the steps below. This is due to the new Python +versions becoming available through `rules-python`. \ +See +[here](https://github.com/tensorflow/tensorflow/commit/f91457f258fdd78f693044a57efa63a38335d1de), +and +[here](https://github.com/tensorflow/tensorflow/commit/052445e04ce20fd747657e0198a1bcec2b6dff5b), +for an example. + +See +[this commit](https://github.com/tensorflow/tensorflow/commit/5f7f05a80aac9b01325a78ec3fcff0dbedb1cc23) +as a rough example of the steps below. + +All the files referenced below are located in the same directory as this README, +unless indicated otherwise. + +1) Add the new version to the `VERSIONS` variable inside + `tensorflow/tools/toolchains/python/python_repo.bzl`. \ + While this isn't necessary for running the updater, it is required for + actually using the new version with TensorFlow.
+ +2) In the `WORKSPACE` file, add the new version to the `python_versions` + parameter of the `python_register_multi_toolchains` function. + +3) In the `BUILD.bazel` file, add a load statement for the new version, e.g. + + ``` + load("@python//3.11:defs.bzl", + compile_pip_requirements_3_11 = "compile_pip_requirements") + ``` + + Add a new entry for the loaded `compile_pip_requirements`, e.g. + + ``` + compile_pip_requirements_3_11( + name = "requirements_3_11", + extra_args = ["--allow-unsafe"], + requirements_in = "requirements.in", + requirements_txt = "requirements_lock_3_11.txt", + ) + ``` + + ``` + compile_pip_requirements_3_11( + name = "requirements_3_11_release", + extra_args = [ + "--allow-unsafe", + "-P keras-nightly", + "-P tb-nightly", + "-P tf-estimator-nightly", + ], + requirements_in = "requirements.in", + requirements_txt = "requirements_lock_3_11.txt", + ) + ``` + +4) Add the version to `SUPPORTED_VERSIONS` in `updater.sh` and + `release_updater.sh`. + +5) Run the `updater.sh` shell script. \ + If the base requirements file hasn't yet been updated to account for the new + Python version (which will require different versions for at least some + dependencies), update it now so that the script runs successfully. + +6) A new `requirements_lock_3_11.txt` file should appear under the root of the + `tensorflow` directory. diff --git a/ci/official/utilities/rename_and_verify_wheels.sh b/ci/official/utilities/rename_and_verify_wheels.sh index cd02b829194a98..4388329ae6edd7 100755 --- a/ci/official/utilities/rename_and_verify_wheels.sh +++ b/ci/official/utilities/rename_and_verify_wheels.sh @@ -20,14 +20,14 @@ set -euxo pipefail DIR=$1 -find $DIR -iname "*.whl" | while read wheel; do +find "$DIR" -iname "*.whl" | while read wheel; do echo "Checking and renaming $wheel..." wheel=$(realpath "$wheel") # Repair wheel based upon name/architecture, fallback to x86 if [[ $wheel == *"aarch64.whl" ]]; then - time python3 -m auditwheel repair --plat manylinux2014_aarch64 "$wheel" --wheel-dir build 2>&1 | tee check.txt + time python3 -m auditwheel repair --plat manylinux2014_aarch64 "$wheel" --wheel-dir "$DIR" 2>&1 | tee check.txt else - time python3 -m auditwheel repair --plat manylinux2014_x86_64 "$wheel" --wheel-dir build 2>&1 | tee check.txt + time python3 -m auditwheel repair --plat manylinux2014_x86_64 "$wheel" --wheel-dir "$DIR" 2>&1 | tee check.txt fi # We don't need the original wheel if it was renamed @@ -38,5 +38,5 @@ find $DIR -iname "*.whl" | while read wheel; do fi rm check.txt - TF_WHEEL="$wheel" bats ./ci/official/utilities/wheel_verification.bats --timing + TF_WHEEL="$wheel" BUILD_DIR="$DIR" bats ./ci/official/utilities/wheel_verification.bats --timing done diff --git a/ci/official/utilities/setup.sh b/ci/official/utilities/setup.sh index 2d1e3300da728c..fc352671274d99 100755 --- a/ci/official/utilities/setup.sh +++ b/ci/official/utilities/setup.sh @@ -57,6 +57,16 @@ else echo 'If you have not, you will see a lot of undefined variable errors.' fi +# Force-disable uploads if the job initiator is not Kokoro +# This is temporary: it's currently standard practice for employees to +# run nightly jobs for testing purposes. We're aiming to move away from +# this with more convenient methods, but as long as it's possible to do, +# we want to make sure those extra jobs don't upload anything.
+# TODO(angerson) Remove this once it's no longer relevant +if [[ "${KOKORO_BUILD_INITIATOR:-}" != "kokoro" ]]; then + source ./ci/official/envs/disable_all_uploads +fi + # Create and expand to the full path of TFCI_OUTPUT_DIR export TFCI_OUTPUT_DIR=$(realpath "$TFCI_OUTPUT_DIR") mkdir -p "$TFCI_OUTPUT_DIR" diff --git a/ci/official/utilities/wheel_verification.bats b/ci/official/utilities/wheel_verification.bats index e63afc43ac2b1f..99d0f32e35162e 100644 --- a/ci/official/utilities/wheel_verification.bats +++ b/ci/official/utilities/wheel_verification.bats @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# Suite of verification tests for the SINGLE TensorFlow wheel in the "build" -# directory, or whatever path is set as $TF_WHEEL. +# Suite of verification tests for the SINGLE TensorFlow wheel in the +# $BUILD_DIR directory, or whatever path is set as $TF_WHEEL. setup_file() { - cd build + cd "$BUILD_DIR" if [[ -z "$TF_WHEEL" ]]; then - export TF_WHEEL=$(find build -iname "*.whl") + export TF_WHEEL=$(find "$BUILD_DIR" -iname "*.whl") fi # Setup the env for the python import testing diff --git a/ci/official/wheel.sh b/ci/official/wheel.sh index e3e569f4c112c2..20c6f2637d7e12 100755 --- a/ci/official/wheel.sh +++ b/ci/official/wheel.sh @@ -29,6 +29,14 @@ fi if [[ "$TFCI_LIBTPU_DOWNLOAD_ENABLE" == 1 ]]; then wget -P ./tensorflow/lib/ "$TFCI_LIBTPU_DOWNLOAD_URL" fi +if [[ "$TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE" == 1 ]]; then + # For nightly jobs, libtpu.so comes from the latest nightly libtpu build. + # Note: expects a working wheel for today + DATE=$(TZ='America/Los_Angeles' date '+%Y%m%d') + tfrun wget "https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/wheels/libtpu-nightly/libtpu_nightly-0.1.dev${DATE}-py3-none-any.whl" -O libtpu.whl + # -j to discard intermediate directories; -o to overwrite if exists; -d to set output dir + tfrun unzip libtpu.whl libtpu/libtpu.so -j -o -d ./tensorflow/lib +fi tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" build "${TFCI_BAZEL_COMMON_ARGS[@]}" //tensorflow/tools/pip_package:build_pip_package tfrun ./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$TFCI_OUTPUT_DIR" "${TFCI_BUILD_PIP_PACKAGE_ARGS[@]}" diff --git a/configure.py b/configure.py index cbb74beb00c22f..c1cb20162012f6 100644 --- a/configure.py +++ b/configure.py @@ -878,11 +878,12 @@ def retrieve_clang_version(clang_executable): # Disable clang extension that rejects type definitions within offsetof. # This was added in clang-16 by https://reviews.llvm.org/D133574. +# Still required for clang-17. # Can be removed once upb is updated, since a type definition is used within # offset of in the current version of ubp. See # https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183. -def disable_clang16_offsetof_extension(clang_version): - if int(clang_version.split('.')[0]) == 16: +def disable_clang_offsetof_extension(clang_version): + if int(clang_version.split('.')[0]) in (16, 17): write_to_bazelrc('build --copt=-Wno-gnu-offsetof-extensions') @@ -1399,7 +1400,7 @@ def main(): # Set up which clang we should use as the cuda / host compiler. 
clang_cuda_compiler_path = set_clang_cuda_compiler_path(environ_cp) clang_version = retrieve_clang_version(clang_cuda_compiler_path) - disable_clang16_offsetof_extension(clang_version) + disable_clang_offsetof_extension(clang_version) else: # Set up which gcc nvcc should use as the host compiler # No need to set this on Windows @@ -1413,7 +1414,7 @@ def main(): if environ_cp.get('TF_NEED_CLANG') == '1': clang_compiler_path = set_clang_compiler_path(environ_cp) clang_version = retrieve_clang_version(clang_compiler_path) - disable_clang16_offsetof_extension(clang_version) + disable_clang_offsetof_extension(clang_version) # ROCm / CUDA are mutually exclusive. # At most 1 GPU platform can be configured. diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 18ed141388c020..ef01b603800a71 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -1056,6 +1056,7 @@ package_group( "//third_party/py/envlogger/...", "//third_party/py/gldm/...", "//third_party/py/guesslang/...", + "//third_party/py/keras/...", "//third_party/py/tf_keras/...", "//third_party/yggdrasil_decision_forests/...", "//waymo/ml/cn/...", diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index bf598c4d57c148..5490149bc905b1 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -1817,6 +1817,21 @@ TF_ImportGraphDefResults* TF_GraphImportGraphDefWithResults( return results; } +TF_ImportGraphDefResults* TF_GraphImportGraphDefWithResultsNoSerialization( + TF_Graph* graph, const TF_Buffer* graph_def, + const TF_ImportGraphDefOptions* options, TF_Status* status) { + const GraphDef* graph_def_ptr = + reinterpret_cast(graph_def->data); + auto results = new TF_ImportGraphDefResults(); + mutex_lock l(graph->mu); + GraphImportGraphDefLocked(graph, *graph_def_ptr, options, results, status); + if (!status->status.ok()) { + delete results; + return nullptr; + } + return results; +} + void TF_GraphImportGraphDefWithReturnOutputs( TF_Graph* graph, const TF_Buffer* graph_def, const TF_ImportGraphDefOptions* options, TF_Output* return_outputs, diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 2f4cf6062de04c..9812b0a7dfcef3 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -834,6 +834,14 @@ TF_GraphImportGraphDefWithResults(TF_Graph* graph, const TF_Buffer* graph_def, const TF_ImportGraphDefOptions* options, TF_Status* status); +// Has the same behavior as TF_GraphImportGraphDefWithResults, but instead of +// taking in a serialized tensorflow::GraphDef, it takes in a *pointer* to the +// C++ *in memory representation* of the GraphDef, stored in `graph_def->data` +TF_CAPI_EXPORT extern TF_ImportGraphDefResults* +TF_GraphImportGraphDefWithResultsNoSerialization( + TF_Graph* graph, const TF_Buffer* graph_def, + const TF_ImportGraphDefOptions* options, TF_Status* status); + // Import the graph serialized in `graph_def` into `graph`. // Convenience function for when only return outputs are needed. // diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 008e2d772a31e7..e50221aadeebfb 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/c_test_util.h" +#include "tensorflow/c/tf_buffer.h" #include "tensorflow/c/tf_buffer_internal.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/cc/saved_model/signature_constants.h" @@ -764,8 +765,15 @@ TEST(CAPI, ImportGraphDef) { EXPECT_EQ(2, TF_ImportGraphDefOptionsNumReturnOutputs(opts)); TF_ImportGraphDefOptionsAddReturnOperation(opts, "scalar"); EXPECT_EQ(1, TF_ImportGraphDefOptionsNumReturnOperations(opts)); + tensorflow::GraphDef graph_def_proto; + ASSERT_TRUE(tensorflow::ParseProtoUnlimited(&graph_def_proto, graph_def->data, + graph_def->length)); + TF_Buffer graph_def_buffer; + graph_def_buffer.data = reinterpret_cast(&graph_def_proto); + graph_def_buffer.length = sizeof(tensorflow::GraphDef*); TF_ImportGraphDefResults* results = - TF_GraphImportGraphDefWithResults(graph, graph_def, opts, s); + TF_GraphImportGraphDefWithResultsNoSerialization(graph, &graph_def_buffer, + opts, s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); TF_Operation* scalar2 = TF_GraphOperationByName(graph, "imported2/scalar"); @@ -956,8 +964,16 @@ TEST(CAPI, ImportGraphDef_MissingUnusedInputMappings) { TF_ImportGraphDefOptionsSetPrefix(opts, "imported"); TF_ImportGraphDefOptionsAddInputMapping(opts, "scalar", 0, {scalar, 0}); TF_ImportGraphDefOptionsAddInputMapping(opts, "fake", 0, {scalar, 0}); + + tensorflow::GraphDef graph_def_proto; + ASSERT_TRUE(tensorflow::ParseProtoUnlimited(&graph_def_proto, graph_def->data, + graph_def->length)); + TF_Buffer graph_def_buffer; + graph_def_buffer.data = reinterpret_cast(&graph_def_proto); + graph_def_buffer.length = sizeof(tensorflow::GraphDef*); TF_ImportGraphDefResults* results = - TF_GraphImportGraphDefWithResults(graph, graph_def, opts, s); + TF_GraphImportGraphDefWithResultsNoSerialization(graph, &graph_def_buffer, + opts, s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); // Check unused input mappings diff --git a/tensorflow/c/experimental/next_pluggable_device/BUILD b/tensorflow/c/experimental/next_pluggable_device/BUILD index c5cd8d15c3642a..5c7bbddc3af6f2 100644 --- a/tensorflow/c/experimental/next_pluggable_device/BUILD +++ b/tensorflow/c/experimental/next_pluggable_device/BUILD @@ -57,6 +57,7 @@ cc_library( hdrs = ["tensor_pjrt_buffer_util.h"], visibility = ["//visibility:public"], deps = [ + "//tensorflow/compiler/jit:pjrt_tensor_buffer_util", "//tensorflow/core:framework", "//tensorflow/core/tfrt/common:async_value_tensor", "//tensorflow/core/tfrt/common:global_state", @@ -64,9 +65,11 @@ cc_library( "//tensorflow/core/tfrt/common:pjrt_util", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:statusor", "@local_xla//xla/pjrt:pjrt_c_api_client", + "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt/c:pjrt_c_api_hdrs", ], ) @@ -80,14 +83,18 @@ tf_cc_test( "//tensorflow/core:framework_types_hdr", "//tensorflow/core/tfrt/common:async_value_tensor", "//tensorflow/core/tfrt/common:pjrt_util", + "@com_google_absl//absl/log:check", "@com_google_googletest//:gtest_main", "@local_tsl//tsl/lib/core:status_test_util", + "@local_tsl//tsl/platform:casts", "@local_tsl//tsl/platform:status_matchers", "@local_tsl//tsl/protobuf:error_codes_proto_impl_cc", + "@local_xla//xla:shape_util", "@local_xla//xla/pjrt:pjrt_api", "@local_xla//xla/pjrt:pjrt_c_api_client", "@local_xla//xla/pjrt:tfrt_cpu_pjrt_client", "@local_xla//xla/pjrt/c:pjrt_c_api_cpu", + 
"@local_xla//xla/pjrt/c:pjrt_c_api_hdrs", "@local_xla//xla/pjrt/c:pjrt_c_api_wrapper_impl", ], ) diff --git a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc index 02f9388aa247b3..18a851e394aea7 100644 --- a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc +++ b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc @@ -15,11 +15,15 @@ limitations under the License. #include "tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h" #include +#include #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/jit/pjrt_tensor_buffer_util.h" #include "xla/pjrt/c/pjrt_c_api.h" #include "xla/pjrt/pjrt_c_api_client.h" +#include "xla/pjrt/pjrt_client.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" @@ -50,14 +54,16 @@ absl::StatusOr GetPjRtCBufferFromTensor(const Tensor* tensor) { absl::Status SetPjRtCBufferToTensor(PJRT_Buffer* c_buffer, xla::PjRtCApiClient* c_api_client, Tensor* tensor) { + auto buffer = std::make_unique(c_api_client, c_buffer); tensorflow::AsyncValueTensor* av_tensor = tensorflow::AsyncValueTensor::FromTensor(tensor); if (av_tensor == nullptr) { - return absl::InternalError( - "The tensor to set PjRtBuffer is not an AsyncValueTensor."); + TF_ASSIGN_OR_RETURN( + *tensor, MakeTensorFromPjRtBuffer(tensor->dtype(), tensor->shape(), + std::move(buffer))); + } else { + av_tensor->SetBuffer(std::move(buffer)); } - av_tensor->SetBuffer( - std::make_unique(c_api_client, c_buffer)); return absl::OkStatus(); } diff --git a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc index 06fbd7b69d293d..c72f0cfafa6ead 100644 --- a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc +++ b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc @@ -14,20 +14,27 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h" +#include #include +#include #include #include #include +#include "absl/log/check.h" +#include "xla/pjrt/c/pjrt_c_api.h" #include "xla/pjrt/c/pjrt_c_api_cpu.h" #include "xla/pjrt/c/pjrt_c_api_wrapper_impl.h" #include "xla/pjrt/pjrt_api.h" #include "xla/pjrt/pjrt_c_api_client.h" #include "xla/pjrt/tfrt_cpu_pjrt_client.h" +#include "xla/shape.h" +#include "xla/shape_util.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/tfrt/common/async_value_tensor.h" #include "tensorflow/core/tfrt/common/pjrt_util.h" #include "tsl/lib/core/status_test_util.h" +#include "tsl/platform/casts.h" #include "tsl/platform/status_matchers.h" #include "tsl/protobuf/error_codes.pb.h" @@ -38,6 +45,27 @@ using ::testing::HasSubstr; using ::testing::NotNull; using ::tsl::testing::StatusIs; +PJRT_Buffer* CreateCBuffer() { + auto status = pjrt::PjrtApi(DEVICE_CPU); + if (!status.ok()) { + CHECK_OK(pjrt::SetPjrtApi(DEVICE_CPU, GetPjrtApi())); + } + auto pjrt_client = xla::GetCApiClient(DEVICE_CPU); + CHECK_OK(pjrt_client.status()); + auto c_api_client = down_cast(pjrt_client->get()); + std::vector data(1, 0); + xla::Shape shape = xla::ShapeUtil::MakeShape(xla::S32, {1}); + + auto buffer = c_api_client->pjrt_c_client()->client->BufferFromHostBuffer( + data.data(), shape.element_type(), shape.dimensions(), + /*byte_strides=*/std::nullopt, + xla::PjRtClient::HostBufferSemantics::kImmutableOnlyDuringCall, nullptr, + c_api_client->pjrt_c_client()->client->addressable_devices()[0]); + CHECK_OK(buffer.status()); + + return new PJRT_Buffer{std::move(*buffer), c_api_client->pjrt_c_client()}; +} + TEST(TensorPjRtBufferUtilTest, GetPjRtCBufferFromTensorNoBuffer) { auto allocator = std::make_unique(); tensorflow::Tensor tensor(allocator.get(), DT_FLOAT, {1}); @@ -103,36 +131,18 @@ TEST(TensorPjRtBufferUtilTest, GetPjRtCBufferFromTensorSuccess) { TEST(TensorPjRtBufferUtilTest, SetPjRtCBufferToTensorNotAsyncValueTensor) { tensorflow::Tensor tensor(DT_FLOAT, {1}); + TF_ASSERT_OK_AND_ASSIGN(auto pjrt_client, xla::GetCApiClient(DEVICE_CPU)); + PJRT_Buffer* c_buffer = CreateCBuffer(); - EXPECT_THAT( - SetPjRtCBufferToTensor(nullptr, nullptr, &tensor), - StatusIs( - error::INTERNAL, - HasSubstr(absl::StrCat( - "The tensor to set PjRtBuffer is not an AsyncValueTensor")))); + TF_EXPECT_OK(SetPjRtCBufferToTensor( + c_buffer, down_cast(pjrt_client.get()), &tensor)); } TEST(TensorPjRtBufferUtilTest, SetPjRtCBufferToTensorSuccess) { auto allocator = std::make_unique(); - auto status = pjrt::PjrtApi(DEVICE_CPU); - if (!status.ok()) { - TF_ASSERT_OK(pjrt::SetPjrtApi(DEVICE_CPU, GetPjrtApi())); - } - TF_ASSERT_OK_AND_ASSIGN(auto pjrt_client, xla::GetCApiClient(DEVICE_CPU)); - auto c_api_client = down_cast(pjrt_client.get()); - std::vector data(1, 0); - xla::Shape shape = xla::ShapeUtil::MakeShape(xla::S32, {1}); - TF_ASSERT_OK_AND_ASSIGN( - auto buffer, - c_api_client->pjrt_c_client()->client->BufferFromHostBuffer( - data.data(), shape.element_type(), shape.dimensions(), - /*byte_strides=*/std::nullopt, - xla::PjRtClient::HostBufferSemantics::kImmutableOnlyDuringCall, - nullptr, - c_api_client->pjrt_c_client()->client->addressable_devices()[0])); tensorflow::Tensor tensor(allocator.get(), DT_FLOAT, {1}); - auto c_buffer = - new PJRT_Buffer{std::move(buffer), c_api_client->pjrt_c_client()}; + TF_ASSERT_OK_AND_ASSIGN(auto pjrt_client, 
xla::GetCApiClient(DEVICE_CPU)); + PJRT_Buffer* c_buffer = CreateCBuffer(); TF_EXPECT_OK(SetPjRtCBufferToTensor( c_buffer, down_cast(pjrt_client.get()), &tensor)); diff --git a/tensorflow/cc/framework/cc_op_gen_util.cc b/tensorflow/cc/framework/cc_op_gen_util.cc index d0c65d10b2184d..0a64525f477c8a 100644 --- a/tensorflow/cc/framework/cc_op_gen_util.cc +++ b/tensorflow/cc/framework/cc_op_gen_util.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/cc/framework/cc_op_gen_util.h" +#include #include #include #include @@ -29,6 +30,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/strcat.h" #include "tensorflow/core/platform/types.h" #include "tsl/platform/statusor.h" @@ -206,7 +208,12 @@ string PrintAttrValue(const string& op, const AttrValue& attr_value) { return strings::StrCat(attr_value.i()); case AttrValue::kF: { const float f = attr_value.f(); - return strings::StrCat(attr_value.f(), floorf(f) == f ? ".0" : "", "f"); + if (std::isinf(f)) { + return strings::StrCat(f < 0.0f ? "-" : "+", + "std::numeric_limits::infinity()"); + } else { + return strings::StrCat(attr_value.f(), floorf(f) == f ? ".0" : "", "f"); + } } case AttrValue::kB: return attr_value.b() ? "true" : "false"; diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index b09ebe0bd0944c..935b37b37aa5c0 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -522,6 +522,7 @@ cc_library( visibility = [ "//learning/brain/contrib/hub/server/distro:__subpackages__", "//learning/brain/contrib/tpu_modeling:__subpackages__", + "//learning/metadata/artifactoid/cc:__subpackages__", "//learning/tfx/pipeline/util:__subpackages__", "//tensorflow/python/saved_model:__subpackages__", ], diff --git a/tensorflow/compiler/jit/device_compiler_client.cc b/tensorflow/compiler/jit/device_compiler_client.cc index 747aa7de090125..5d0042c3a4ed76 100644 --- a/tensorflow/compiler/jit/device_compiler_client.cc +++ b/tensorflow/compiler/jit/device_compiler_client.cc @@ -37,8 +37,6 @@ xla::ExecutableBuildOptions GetExecutableBuildOptions( build_options.set_alias_passthrough_params(options.alias_passthrough_params); build_options.mutable_debug_options()->set_xla_detailed_logging( options.detailed_logging); - build_options.mutable_debug_options()->set_xla_enable_dumping( - options.detailed_logging); if (tensorflow::OpDeterminismRequired()) { build_options.mutable_debug_options()->set_xla_gpu_deterministic_ops(true); } diff --git a/tensorflow/compiler/jit/device_compiler_client_test.cc b/tensorflow/compiler/jit/device_compiler_client_test.cc index 4ac2e7f1a205d1..f42ae36f50735b 100644 --- a/tensorflow/compiler/jit/device_compiler_client_test.cc +++ b/tensorflow/compiler/jit/device_compiler_client_test.cc @@ -60,5 +60,17 @@ TEST(GetExecutableOptionTest, DeviceOrdinalNotSet) { EXPECT_EQ(build_option.device_ordinal(), -1); } +TEST(GetExecutableOptionTest, DumpingWithoutDetailedLogging) { + XlaCompiler::Options options; + options.detailed_logging = false; + XlaCompiler::CompilationResult result; + + auto build_option = + GetExecutableBuildOptions(options, result, /*default_device_ordinal=*/-1); + + EXPECT_FALSE(build_option.debug_options().xla_detailed_logging()); + EXPECT_TRUE(build_option.debug_options().xla_enable_dumping()); +} + } // namespace } // namespace tensorflow diff --git 
a/tensorflow/compiler/jit/get_compiler_ir.cc b/tensorflow/compiler/jit/get_compiler_ir.cc index 37987cb55ecf38..2614d9909bbd80 100644 --- a/tensorflow/compiler/jit/get_compiler_ir.cc +++ b/tensorflow/compiler/jit/get_compiler_ir.cc @@ -74,8 +74,6 @@ static StatusOr> BuildExecutable( build_options.set_alias_passthrough_params(options.alias_passthrough_params); build_options.mutable_debug_options()->set_xla_detailed_logging( options.detailed_logging); - build_options.mutable_debug_options()->set_xla_enable_dumping( - options.detailed_logging); // If the embed_ir_in_executable is set, hlo_proto will be dumped in // executable. The hlo_proto contains HLO modules and buffer assignment. build_options.mutable_debug_options()->set_xla_embed_ir_in_executable( @@ -148,7 +146,7 @@ static StatusOr> BuildXlaCompilerArgumentFromTensorSpec( const FunctionBody* fbody, absl::Span must_be_constant_idxs, absl::Span inputs, - absl::Span variable_args, Device* device, + absl::Span variable_args, absl::Span flat_arg_shape_and_dtype) { TF_RET_CHECK(fbody != nullptr); auto& input_args = fbody->fdef.signature().input_arg(); @@ -326,7 +324,7 @@ StatusOr GetCompilerIr( if (compiler_arg_source == CompilerArgSource::TENSOR_SPEC) { args = BuildXlaCompilerArgumentFromTensorSpec(fbody, constant_arg_indices, - inputs, variable_infos, dev, + inputs, variable_infos, input_arg_shape_and_dtype); } else if (compiler_arg_source == CompilerArgSource::CONCRETE_INPUT) { args = XlaComputationLaunchContext::BuildXlaCompilerArguments( diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 28e2753c68649c..0bde3279c1d7f6 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -61,6 +61,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", # buildcleaner:keep "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops", "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", + "//tensorflow/compiler/mlir/tf2xla/internal/passes:clustering_passes", "//tensorflow/compiler/mlir/tf2xla/transforms:tf_xla_passes", "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/mlir/tosa:tf_passes", diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 87de9412f32756..8117705b0fac2b 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -12,6 +12,7 @@ package( "//learning/brain/mobile/programmability:__subpackages__", "//tensorflow/lite/experimental/tf_runtime:__subpackages__", "//tensorflow/lite/testing:__subpackages__", + "//third_party/odml/infra/genai/conversion/per_layer:__subpackages__", ], licenses = ["notice"], ) @@ -873,8 +874,8 @@ cc_library( "transforms/lift_tflite_flex_ops.h", ], deps = [ + ":tensorflow_lite", ":tensorflow_lite_passes_inc_gen", - "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_attr", "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", @@ -883,6 +884,7 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", "@flatbuffers", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-gpu.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-gpu.mlir index b53508f01efa2d..cbd2dd6618f93d 100644 --- 
a/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-gpu.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-gpu.mlir @@ -153,8 +153,8 @@ func.func @padSliceTo4D(%arg0: tensor<4x384x32xf32>) -> tensor<1x384x32xf32> { } // CHECK: func @padSliceTo4D(%[[VAL_0:.*]]: tensor<4x384x32xf32>) -> tensor<1x384x32xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 1, 384, 32]> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[1, 1, 384, 32]> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<[1, 4, 384, 32]> : tensor<4xi32> // CHECK-DAG: %[[VAL_4:.*]] = "tfl.pseudo_const"() {value = dense<[1, 384, 32]> : tensor<3xi32> // CHECK: %[[VAL_5:.*]] = "tfl.reshape"(%[[VAL_0]], %[[VAL_3]]) : (tensor<4x384x32xf32>, tensor<4xi32>) -> tensor<1x4x384x32xf32> diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 7070f5795ca3a0..be4aae3d95db08 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -675,6 +675,11 @@ class Translator { std::optional>> CreateMetadataVector(); + // Encodes the `tfl.metadata_buffer` array attribute of the module to the + // metadata_buffer section in the final model. Returns empty if there isn't + // such attribute in the mlir module. + VectorBufferOffset CreateMetadataBufferVector(); + // Builds and returns list of tfl.SignatureDef sections in the model. std::optional>> CreateSignatureDefs(const std::vector& signature_defs); @@ -1635,6 +1640,14 @@ std::optional> Translator::BuildOperator( if (auto shlo_op = llvm::dyn_cast(inst)) { return BuildStablehloGatherOp(shlo_op, operands, results); } + if (auto shlo_op = llvm::dyn_cast(inst)) { + return BuildStablehloOperatorwithoutOptions( + inst, operands, results, tflite::BuiltinOperator_STABLEHLO_ADD); + } + if (auto shlo_op = llvm::dyn_cast(inst)) { + return BuildStablehloOperatorwithoutOptions( + inst, operands, results, tflite::BuiltinOperator_STABLEHLO_MULTIPLY); + } // for ops don't have kernels, only serialize when conversion is set to true if (convert_stablehlo_) { if (auto shlo_op = llvm::dyn_cast(inst)) { @@ -1643,17 +1656,6 @@ std::optional> Translator::BuildOperator( tflite::BuiltinOperator_STABLEHLO_LOGISTIC); } - if (auto shlo_op = llvm::dyn_cast(inst)) { - return BuildStablehloOperatorwithoutOptions( - inst, operands, results, tflite::BuiltinOperator_STABLEHLO_ADD); - } - - if (auto shlo_op = llvm::dyn_cast(inst)) { - return BuildStablehloOperatorwithoutOptions( - inst, operands, results, - tflite::BuiltinOperator_STABLEHLO_MULTIPLY); - } - if (auto shlo_op = llvm::dyn_cast(inst)) { return BuildStablehloOperatorwithoutOptions( inst, operands, results, tflite::BuiltinOperator_STABLEHLO_DIVIDE); @@ -2608,6 +2610,18 @@ Translator::CreateMetadataVector() { return builder_.CreateVector(metadata); } +VectorBufferOffset Translator::CreateMetadataBufferVector() { + auto array_attr = + module_->getAttrOfType("tfl.metadata_buffer"); + std::vector metadata_buffer; + if (!array_attr) return 0; + for (auto value : array_attr.getAsValueRange()) { + metadata_buffer.push_back(value.getSExtValue()); + } + + return builder_.CreateVector(metadata_buffer); +} 
+ // Helper method that returns list of all strings in a StringAttr identified // by 'attr_key' and values are separated by a comma. llvm::SmallVector GetStringsFromAttrWithSeparator( @@ -3010,7 +3024,8 @@ std::optional Translator::TranslateInternal() { // Build the model and finish the model building process. auto description = builder_.CreateString(model_description.data()); - VectorBufferOffset metadata_buffer = 0; // Deprecated + VectorBufferOffset metadata_buffer = + CreateMetadataBufferVector(); // Deprecated auto metadata = CreateMetadataVector(); if (!metadata) return std::nullopt; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 40943740ba1f00..6eb2aee99aacb9 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -1904,6 +1904,11 @@ OwningOpRef tflite::FlatBufferToMlir( mlir::UnitAttr::get(builder.getContext())); } + if (!model->metadata_buffer.empty()) { + module->setAttr("tfl.metadata_buffer", + builder.getI32ArrayAttr(model->metadata_buffer)); + } + if (use_stablehlo_constant) { module->setAttr("tfl.metadata", builder.getDictionaryAttr(builder.getNamedAttr( diff --git a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc index 83d0a0e6cf1ccf..798473986f64fd 100644 --- a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc +++ b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc @@ -160,7 +160,7 @@ TEST(ErrorCollectorTest, TessFailurePass) { EXPECT_EQ(collected_errors.count(NewConverterErrorData( "MockFailurePass", "Failed at tf.Const op\nsee current operation: %0 = " - "\"tf.Const\"() {value = dense<1> : tensor<4xi32>} : () -> " + "\"tf.Const\"() <{value = dense<1> : tensor<4xi32>}> : () -> " "tensor<4xi32>\nError code: ERROR_NEEDS_FLEX_OPS", ConverterErrorData::ERROR_NEEDS_FLEX_OPS, "tf.Const", mlir::FileLineColLoc::get(input_file_id, 2, 9))), @@ -168,22 +168,23 @@ TEST(ErrorCollectorTest, TessFailurePass) { EXPECT_EQ(collected_errors.count(NewConverterErrorData( "MockFailurePass", "Failed at tf.Const op\nsee current operation: %1 = " - "\"tf.Const\"() {value = dense<0> : tensor<4xi32>} : () -> " + "\"tf.Const\"() <{value = dense<0> : tensor<4xi32>}> : () -> " "tensor<4xi32>\nError code: ERROR_NEEDS_FLEX_OPS", ConverterErrorData::ERROR_NEEDS_FLEX_OPS, "tf.Const", mlir::FileLineColLoc::get(input_file_id, 2, 9))), 1); - EXPECT_EQ(collected_errors.count(NewConverterErrorData( - "MockFailurePass", - "Failed at tf.StridedSlice op\nsee current operation: %2 = " - "\"tf.StridedSlice\"(%arg0, %1, %1, %0) {begin_mask = 11 : " - "i64, device = \"\", ellipsis_mask = 0 : i64, end_mask = 11 : " - "i64, new_axis_mask = 4 : i64, shrink_axis_mask = 0 : i64} : " - "(tensor<*xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) " - "-> tensor<*xf32>\nError code: ERROR_NEEDS_FLEX_OPS", - ConverterErrorData::ERROR_NEEDS_FLEX_OPS, "tf.StridedSlice", - mlir::FileLineColLoc::get(input_file_id, 4, 10))), - 1); + EXPECT_EQ( + collected_errors.count(NewConverterErrorData( + "MockFailurePass", + "Failed at tf.StridedSlice op\nsee current operation: %2 = " + "\"tf.StridedSlice\"(%arg0, %1, %1, %0) <{begin_mask = 11 : " + "i64, ellipsis_mask = 0 : i64, end_mask = 11 : i64, new_axis_mask = " + "4 : i64, shrink_axis_mask = 0 : i64}> {device = \"\"} : " + "(tensor<*xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) " + "-> tensor<*xf32>\nError code: 
ERROR_NEEDS_FLEX_OPS", + ConverterErrorData::ERROR_NEEDS_FLEX_OPS, "tf.StridedSlice", + mlir::FileLineColLoc::get(input_file_id, 4, 10))), + 1); // Check the location information. std::vector locations; diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_default.mlir b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_default.mlir index 589d438311ac50..9c6d9b8aa8059b 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_default.mlir +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_default.mlir @@ -4,7 +4,7 @@ func.func @bias_add(%arg0: tensor<1x10x10x32xf32>, %arg1: tensor<32xf32>) -> tensor<1x10x10x32xf32> { %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> func.return %0 : tensor<1x10x10x32xf32> -// CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> +// CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%arg0, %arg1) <{data_format = "NHWC"}> {T = "tfdtype$DT_FLOAT"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> // CHECK: return %[[BIASADD_0]] : tensor<1x10x10x32xf32> } @@ -30,8 +30,8 @@ func.func @conv2d_backprop_input_with_add(%arg0: tensor<4xi32>, %arg1: tensor<3x %1 = "tf.Const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> %2 = "tf.AddV2"(%0, %1): (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> func.return %2 : tensor<15x28x28x1xf32> -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> -// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) {dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) <{dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]}> : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> // CHECK: %[[ADDV2_0:.*]] = "tf.AddV2"(%[[CONV2DBACKPROPINPUT_0]], %[[CONST_0]]) : (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> // CHECK: return %[[ADDV2_0]] : tensor<15x28x28x1xf32> } @@ -42,8 +42,8 @@ func.func @conv2d_backprop_input_with_sub(%arg0: tensor<4xi32>, %arg1: tensor<3x %1 = "tf.Const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> %2 = "tf.Sub"(%0, %1): (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> func.return %2 : tensor<15x28x28x1xf32> -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> -// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) {dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) <{dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]}> 
: (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> // CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[CONV2DBACKPROPINPUT_0]], %[[CONST_0]]) : (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> // CHECK: return %[[SUB_0]] : tensor<15x28x28x1xf32> } @@ -71,7 +71,7 @@ func.func @identity_with_const() -> tensor<*xf32> { %0 = "tf.Identity"(%cst) {device = ""} : (tensor<2xf32>) -> tensor<*xf32> %1 = "tf.AddV2"(%0, %cst_1) {device = ""} : (tensor<*xf32>, tensor) -> tensor<*xf32> func.return %1 : tensor<*xf32> -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<[2.177590e-01, 2.89503098]> : tensor<2xf32>} : () -> tensor<*xf32> +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<[2.177590e-01, 2.89503098]> : tensor<2xf32>}> : () -> tensor<*xf32> // CHECK: return %[[CONST_0]] : tensor<*xf32> } @@ -80,7 +80,7 @@ func.func @identity(%arg0: tensor<2xf32>) -> tensor<*xf32> { %0 = "tf.Identity"(%arg0) {device = ""} : (tensor<2xf32>) -> tensor<*xf32> %1 = "tf.AddV2"(%0, %cst_1) {device = ""} : (tensor<*xf32>, tensor) -> tensor<*xf32> func.return %1 : tensor<*xf32> -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {device = "", value = dense<1.000000e-03> : tensor} : () -> tensor +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<1.000000e-03> : tensor}> {device = ""} : () -> tensor // CHECK: %[[IDENTITY_0:.*]] = "tf.Identity"(%arg0) {device = ""} : (tensor<2xf32>) -> tensor<*xf32> // CHECK: %[[ADDV2_0:.*]] = "tfl.custom"(%0, %cst) {custom_code = "FlexAddV2", custom_option = #tfl} : (tensor<*xf32>, tensor) -> tensor<*xf32> // CHECK: return %[[ADDV2_0]] : tensor<*xf32> @@ -95,7 +95,7 @@ func.func @bias_add_with_identity(%arg0: tensor<4xi32>, %arg1: tensor<3x3x1x32xf %2 = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) {strides = [1, 2, 2, 1], padding="SAME", dilations=[1, 1, 1, 1]}: (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> %3 = "tf.AddV2"(%2, %1): (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> func.return %2 : tensor<15x28x28x1xf32> -// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) {dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> +// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) <{dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]}> : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> // CHECK: return %[[CONV2DBACKPROPINPUT_0]] : tensor<15x28x28x1xf32> } @@ -108,10 +108,10 @@ func.func @conv_with_relu1_pattern1(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x4 %1 = "tf.Maximum"(%0, %cst_0) : (tensor<1x3x4x2xf32>, tensor) -> tensor<1x3x4x2xf32> %2 = "tf.Minimum"(%1, %cst_1) : (tensor<1x3x4x2xf32>, tensor) -> tensor<1x3x4x2xf32> func.return %2 : tensor<1x3x4x2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<1x1x3x2xf32>} : () -> tensor<1x1x3x2xf32> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<-1.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor -// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x3x4x3xf32>, tensor<1x1x3x2xf32>) -> tensor<1x3x4x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = 
"tf.Const"() <{value = dense<{{.*}}> : tensor<1x1x3x2xf32>}> : () -> tensor<1x1x3x2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<-1.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor +// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x3x4x3xf32>, tensor<1x1x3x2xf32>) -> tensor<1x3x4x2xf32> // CHECK: %[[MAXIMUM_0:.*]] = "tf.Maximum"(%[[CONV2D_0]], %[[CONST_1]]) : (tensor<1x3x4x2xf32>, tensor) -> tensor<1x3x4x2xf32> // CHECK: %[[MINIMUM_0:.*]] = "tf.Minimum"(%[[MAXIMUM_0]], %[[CONST_2]]) : (tensor<1x3x4x2xf32>, tensor) -> tensor<1x3x4x2xf32> // CHECK: return %[[MINIMUM_0]] : tensor<1x3x4x2xf32> @@ -126,10 +126,10 @@ func.func @conv_with_relu1_pattern2(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x4 %1 = "tf.Minimum"(%0, %cst_1) : (tensor<1x3x4x2xf32>, tensor) -> tensor<1x3x4x2xf32> %2 = "tf.Maximum"(%1, %cst_0) : (tensor<1x3x4x2xf32>, tensor) -> tensor<1x3x4x2xf32> func.return %2 : tensor<1x3x4x2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<1x1x3x2xf32>} : () -> tensor<1x1x3x2xf32> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<-1.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor -// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x3x4x3xf32>, tensor<1x1x3x2xf32>) -> tensor<1x3x4x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<1x1x3x2xf32>}> : () -> tensor<1x1x3x2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<-1.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor +// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x3x4x3xf32>, tensor<1x1x3x2xf32>) -> tensor<1x3x4x2xf32> // CHECK: %[[MINIMUM_0:.*]] = "tf.Minimum"(%[[CONV2D_0]], %[[CONST_2]]) : (tensor<1x3x4x2xf32>, tensor) -> tensor<1x3x4x2xf32> // CHECK: %[[MAXIMUM_0:.*]] = "tf.Maximum"(%[[MINIMUM_0]], %[[CONST_1]]) : (tensor<1x3x4x2xf32>, tensor) -> tensor<1x3x4x2xf32> // CHECK: return %[[MAXIMUM_0]] : tensor<1x3x4x2xf32> @@ -144,10 +144,10 @@ func.func @conv_with_relu1_invalid_pattern(%arg0: tensor<1x3x4x3xf32>) -> (tenso %1 = "tf.Minimum"(%0, %cst_1) : (tensor<1x3x4x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> %2 = "tf.Maximum"(%1, %cst_0) : (tensor<1x3x4x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> func.return %2 : tensor<1x3x4x2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<1x1x3x2xf32>} : () -> tensor<1x1x3x2xf32> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<[-1.000000e+00, -3.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() {value = dense<[1.000000e+00, 3.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x3x4x3xf32>, tensor<1x1x3x2xf32>) -> 
tensor<1x3x4x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<1x1x3x2xf32>}> : () -> tensor<1x1x3x2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<[-1.000000e+00, -3.000000e+00]> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<[1.000000e+00, 3.000000e+00]> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x3x4x3xf32>, tensor<1x1x3x2xf32>) -> tensor<1x3x4x2xf32> // CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%[[CONV2D_0]], %[[CONST_2]]) {custom_code = "FlexMinimum", custom_option = #tfl} : (tensor<1x3x4x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> // CHECK: %[[CUSTOM_1:.*]] = "tfl.custom"(%[[CUSTOM_0]], %[[CONST_1]]) {custom_code = "FlexMaximum", custom_option = #tfl} : (tensor<1x3x4x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> // CHECK: return %[[CUSTOM_1]] : tensor<1x3x4x2xf32> diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_legacy.mlir b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_legacy.mlir index dea1a9cc823f0b..5835d7d107cef5 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_legacy.mlir +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_legacy.mlir @@ -4,7 +4,7 @@ func.func @bias_add(%arg0: tensor<1x10x10x32xf32>, %arg1: tensor<32xf32>) -> tensor<1x10x10x32xf32> { %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> func.return %0 : tensor<1x10x10x32xf32> -// CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> +// CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%arg0, %arg1) <{data_format = "NHWC"}> {T = "tfdtype$DT_FLOAT"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> // CHECK: return %[[BIASADD_0]] : tensor<1x10x10x32xf32> } @@ -30,8 +30,8 @@ func.func @conv2d_backprop_input_with_add(%arg0: tensor<4xi32>, %arg1: tensor<3x %1 = "tf.Const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> %2 = "tf.AddV2"(%0, %1): (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> func.return %2 : tensor<15x28x28x1xf32> -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> -// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) {dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) <{dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]}> : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> // CHECK: %[[ADDV2_0:.*]] = "tf.AddV2"(%[[CONV2DBACKPROPINPUT_0]], %[[CONST_0]]) {no_fallback} : (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> // CHECK: return %[[ADDV2_0]] : tensor<15x28x28x1xf32> } @@ -42,8 +42,8 @@ func.func 
@conv2d_backprop_input_with_sub(%arg0: tensor<4xi32>, %arg1: tensor<3x %1 = "tf.Const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> %2 = "tf.Sub"(%0, %1): (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> func.return %2 : tensor<15x28x28x1xf32> -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> -// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) {dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) <{dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]}> : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> // CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[CONV2DBACKPROPINPUT_0]], %[[CONST_0]]) {no_fallback} : (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> // CHECK: return %[[SUB_0]] : tensor<15x28x28x1xf32> } @@ -73,6 +73,6 @@ func.func @bias_add_with_identity(%arg0: tensor<4xi32>, %arg1: tensor<3x3x1x32xf %2 = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) {strides = [1, 2, 2, 1], padding="SAME", dilations=[1, 1, 1, 1]}: (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> %3 = "tf.AddV2"(%2, %1): (tensor<15x28x28x1xf32>, tensor<1xf32>) -> tensor<15x28x28x1xf32> func.return %2 : tensor<15x28x28x1xf32> -// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) {dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> +// CHECK: %[[CONV2DBACKPROPINPUT_0:.*]] = "tf.Conv2DBackpropInput"(%arg0, %arg1, %arg2) <{dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 2, 2, 1]}> : (tensor<4xi32>, tensor<3x3x1x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> // CHECK: return %[[CONV2DBACKPROPINPUT_0]] : tensor<15x28x28x1xf32> } diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/tf_to_quant.mlir b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/tf_to_quant.mlir index 83f07de73cc5c7..dd93ae25948fef 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/tf_to_quant.mlir +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/tf_to_quant.mlir @@ -36,7 +36,7 @@ func.func @fakeQuantForActivationNoDuplication(tensor<8xf32>) -> (tensor<8x!quan %1 = "quantfork.qcast"(%0) : (tensor<8xf32>) -> tensor<8x!quant.uniform> func.return %1 : tensor<8x!quant.uniform> -// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) {narrow_range = false, num_bits = 5 : i64} +// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) <{narrow_range = false, num_bits = 5 : i64}> // CHECK: %1 = "quantfork.qcast"(%0) : (tensor<8xf32>) -> tensor<8x!quant.uniform> // CHECK: return %1 } @@ -51,7 +51,7 @@ func.func @fakeQuantFolded() -> (tensor<8xf32>) { %rst = "tf.FakeQuantWithMinMaxVars"(%in, %mini, %maxi) {num_bits = 5, narrow_range = false} : (tensor<8xf32>, tensor, tensor) -> tensor<8xf32> func.return %rst : tensor<8xf32> -// CHECK: %[[CONSTANT:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<8xf32>} +// CHECK: %[[CONSTANT:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : 
tensor<8xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT]]) : (tensor<8xf32>) -> tensor<8x!quant.uniform> // CHECK: %[[DEQUANTIZE:.*]] = "quantfork.dcast"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> @@ -79,7 +79,7 @@ func.func @fakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x8x7x16xf3 %rst = "tf.Conv2D"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x8x7x16xf32> func.return %rst : tensor<256x8x7x16xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x3x3x16xf32>} +// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3x3x3x16xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT0]]) : (tensor<3x3x3x16xf32>) -> tensor<3x3x3x16x!quant.uniform> // CHECK: %[[DEQUANTIZE:.*]] = "quantfork.dcast"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tf.Conv2D"(%arg0, %[[DEQUANTIZE]]) @@ -98,7 +98,7 @@ func.func @perChannelFakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256 %rst = "tf.Conv2D"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x8x7x16xf32> func.return %rst : tensor<256x8x7x16xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x3x3x16xf32>} +// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3x3x3x16xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT0]]) : (tensor<3x3x3x16xf32>) -> tensor<3x3x3x16x!quant.uniform> @@ -119,7 +119,7 @@ func.func @fakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x %rst = "tf.DepthwiseConv2dNative"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> func.return %rst : tensor<256x30x30x16xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x3x3x16xf32>} +// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3x3x3x16xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT0]]) : (tensor<3x3x3x16xf32>) -> tensor<3x3x3x16x!quant.uniform> // CHECK: %[[DEQUANTIZE:.*]] = "quantfork.dcast"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[DEQUANTIZE]]) @@ -138,7 +138,7 @@ func.func @perChannelFakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (t %rst = "tf.DepthwiseConv2dNative"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> func.return %rst : tensor<256x30x30x16xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x3x3x16xf32>} +// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3x3x3x16xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT0]]) : (tensor<3x3x3x16xf32>) -> tensor<3x3x3x16x!quant.uniform> diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/tf_to_quant_4bit.mlir b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/tf_to_quant_4bit.mlir index 54c3de4bacd7a3..519226a7755b5b 100644 --- 
a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/tf_to_quant_4bit.mlir +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/tf_to_quant_4bit.mlir @@ -36,7 +36,7 @@ func.func @fakeQuantForActivationNoDuplication(tensor<8xf32>) -> (tensor<8x!quan %1 = "quantfork.qcast"(%0) : (tensor<8xf32>) -> tensor<8x!quant.uniform> func.return %1 : tensor<8x!quant.uniform> -// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) {narrow_range = false, num_bits = 3 : i64} +// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) <{narrow_range = false, num_bits = 3 : i64}> // CHECK: %1 = "quantfork.qcast"(%0) : (tensor<8xf32>) -> tensor<8x!quant.uniform> // CHECK: return %1 } @@ -51,7 +51,7 @@ func.func @fakeQuantFolded() -> (tensor<8xf32>) { %rst = "tf.FakeQuantWithMinMaxVars"(%in, %mini, %maxi) {num_bits = 3, narrow_range = false} : (tensor<8xf32>, tensor, tensor) -> tensor<8xf32> func.return %rst : tensor<8xf32> -// CHECK: %[[CONSTANT:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<8xf32>} +// CHECK: %[[CONSTANT:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<8xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT]]) : (tensor<8xf32>) -> tensor<8x!quant.uniform> // CHECK: %[[DEQUANTIZE:.*]] = "quantfork.dcast"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> @@ -79,7 +79,7 @@ func.func @fakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x8x7x16xf3 %rst = "tf.Conv2D"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x8x7x16xf32> func.return %rst : tensor<256x8x7x16xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x3x3x16xf32>} +// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3x3x3x16xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT0]]) : (tensor<3x3x3x16xf32>) -> tensor<3x3x3x16x!quant.uniform> // CHECK: %[[DEQUANTIZE:.*]] = "quantfork.dcast"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tf.Conv2D"(%arg0, %[[DEQUANTIZE]]) @@ -98,7 +98,7 @@ func.func @perChannelFakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256 %rst = "tf.Conv2D"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x8x7x16xf32> func.return %rst : tensor<256x8x7x16xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x3x3x16xf32>} +// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3x3x3x16xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT0]]) : (tensor<3x3x3x16xf32>) -> tensor<3x3x3x16x!quant.uniform> @@ -119,7 +119,7 @@ func.func @fakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x %rst = "tf.DepthwiseConv2dNative"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> func.return %rst : tensor<256x30x30x16xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x3x3x16xf32>} +// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3x3x3x16xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT0]]) : (tensor<3x3x3x16xf32>) -> tensor<3x3x3x16x!quant.uniform> // CHECK: 
%[[DEQUANTIZE:.*]] = "quantfork.dcast"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[DEQUANTIZE]]) @@ -138,7 +138,7 @@ func.func @perChannelFakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (t %rst = "tf.DepthwiseConv2dNative"(%arg, %fq) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> func.return %rst : tensor<256x30x30x16xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x3x3x16xf32>} +// CHECK: %[[CONSTANT0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3x3x3x16xf32>}> // CHECK: %[[QUANTIZE:.*]] = "quantfork.qcast"(%[[CONSTANT0]]) : (tensor<3x3x3x16xf32>) -> tensor<3x3x3x16x!quant.uniform> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD index ce3de9b523e54d..2459f3d214d13a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD @@ -193,6 +193,7 @@ cc_library( ":drop_savedmodel_semantics", ":fold_broadcast_pass", ":fuse_convolution_pass", + ":legalize_tf_xla_call_module_to_stablehlo_pass", ":optimize", ":rename_entrypoint_to_main", ":smuggle_disallowed_ops", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/call_xla_module_to_stablehlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/call_xla_module_to_stablehlo.mlir new file mode 100644 index 00000000000000..795840247cab93 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/call_xla_module_to_stablehlo.mlir @@ -0,0 +1,26 @@ +//RUN: tf_tfl_translate --enable-stablehlo-conversion --input-mlir %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s + + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1660 : i32}} { + func.func @main(%arg0: tensor<2x3xi32>) -> tensor<2x3xi32> attributes {tf.entry_function = {control_outputs = "", inputs = "args_tf_0", outputs = "Identity"}} { + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Identity"(%arg0) {device = ""} : (tensor<2x3xi32>) -> tensor<2x3xi32> + %outputs_0, %control_1 = tf_executor.island wraps "tf.XlaSharding"(%outputs) {_XlaSharding = "", device = "", sharding = "", unspecified_dims = []} : (tensor<2x3xi32>) -> tensor<2x3xi32> + %outputs_2, %control_3 = tf_executor.island wraps "tf.XlaCallModule"(%outputs_0) {Sout = [#tf_type.shape<2x3>], device = "", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = 
"ML\EFR\01StableHLO_v0.9.0\00\01\17\05\01\03\01\03\05\03\07\07\09\0B\03]?\0B\01)\07\0F\0B+\0B\0F\0B\0B\0B3\0B\0B\0B\0B\0F\0B\0F\0B\13\0B\03\17\0F\13\0B\0B\0B\0F\13\0B\0B\0B\0B\01\05\0B\0F\03\07\17\17\07\02\D7\1F\11\03\05\05\0D\03\09\09\0B\0D\03\0F\03\05\11\05\0F\11\01\00\05\11\05\13\05\15\03\0B\15)\171\193\05;\1B=\05\17\05\19\05\1B\05\1D\1D\1F\01\05\1F\1D#%\05!\17'\A9\01\05#\03\03+\0D\03-/\1D%\1D'#\07\03\035\0D\0379\1D)\1D+\1D-\1D/\01\09\01\02\02)\05\09\0D\09\11\03\05\03\05\1B\04C\05\01\11\01\07\07\03\01\05\03\11\01\13\07\03\05\0B\03\05\1D\05\06!\03\05\05\01\01\07\04\01\03\03\06\03\01\05\01\00f\051\0F\0B\03!\1B\1D[;\05\1F\15\1D\15\1D%)9\13\15\19\11\0F\0B\11builtin\00vhlo\00module\00func_v1\00multiply_v1\00return_v1\00sym_name\00jax.uses_shape_polymorphism\00mhlo.num_partitions\00mhlo.num_replicas\00jit_jax_model\00arg_attrs\00function_type\00res_attrs\00sym_visibility\00x\00jit(jax_model)/jit(main)/mul\00experimental/users/ypang/lite/convert_ulm.py\00mhlo.sharding\00{replicated}\00jax.result_info\00\00main\00public\00", platforms = ["CPU"], version = 8 : i64} : (tensor<2x3xi32>) -> tensor<2x3xi32> + %control_4 = tf_executor.island(%control_3) wraps "tf.NoOp"() {device = ""} : () -> () + %outputs_5, %control_6 = tf_executor.island wraps "tf.PreventGradient"(%outputs_2) {device = "", message = "The jax2tf-converted function does not support gradients. Use `with_gradient` parameter to enable gradients"} : (tensor<2x3xi32>) -> tensor<2x3xi32> + %outputs_7, %control_8 = tf_executor.island wraps "tf.Identity"(%outputs_5) {device = ""} : (tensor<2x3xi32>) -> tensor<2x3xi32> + %outputs_9, %control_10 = tf_executor.island(%control_4) wraps "tf.Identity"(%outputs_7) {device = ""} : (tensor<2x3xi32>) -> tensor<2x3xi32> + tf_executor.fetch %outputs_9 : tensor<2x3xi32> + } + return %0 : tensor<2x3xi32> + } +} + +// CHECK: module attributes {tfl.description = "MLIR Converted.", tfl.metadata = {keep_stablehlo_constant = "true"}, tfl.schema_version = 3 : i32} { +// CHECK-NEXT: func.func @main(%arg0: tensor<2x3xi32>) -> tensor<2x3xi32> attributes {tf.entry_function = {inputs = "args_tf_0", outputs = "Identity"}} { +// CHECK-NEXT: %0 = stablehlo.custom_call @Sharding(%arg0) {mhlo.sharding = ""} : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK-NEXT: %1 = stablehlo.multiply %0, %0 : tensor<2x3xi32> +// CHECK-NEXT: return %1 : tensor<2x3xi32> +// CHECK-NEXT: } +// CHECK-NEXT: } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir index 18cb7cebfd60b7..96a7152b550dcf 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir @@ -562,15 +562,15 @@ func.func @pow_dynamic(%arg0: tensor) -> tensor { // CHECK-LABEL: func @floordiv_broadcast_i32( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3xi32>) -> tensor<2x3xi32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0> : tensor<2x3xi32>}> : () -> tensor<2x3xi32> // CHECK: %[[VAL_3:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_2]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<0> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: %[[VAL_5:.*]] = "tf.Less"(%[[VAL_1]], 
%[[VAL_4]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> -// CHECK: %[[VAL_6:.*]] = "tf.Equal"(%[[VAL_3]], %[[VAL_5]]) {incompatible_shape_error = true} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> +// CHECK: %[[VAL_6:.*]] = "tf.Equal"(%[[VAL_3]], %[[VAL_5]]) <{incompatible_shape_error = true}> : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> // CHECK: %[[VAL_7:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> // CHECK: %[[VAL_8:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: %[[VAL_9:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<3xi32>) -> tensor<3xi32> -// CHECK: %[[VAL_10:.*]] = "tf.Const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_10:.*]] = "tf.Const"() <{value = dense<1> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: %[[VAL_11:.*]] = "tf.Sub"(%[[VAL_9]], %[[VAL_10]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> // CHECK: %[[VAL_12:.*]] = "tf.AddV2"(%[[VAL_8]], %[[VAL_11]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> // CHECK: %[[VAL_13:.*]] = "tf.Neg"(%[[VAL_12]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> @@ -601,15 +601,15 @@ func.func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) // CHECK-LABEL: func @floordiv_reverse_broadcast_i32( // CHECK-SAME: %[[VAL_0:.*]]: tensor<3xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3xi32>) -> tensor<2x3xi32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: %[[VAL_3:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_2]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<0> : tensor<2x3xi32>}> : () -> tensor<2x3xi32> // CHECK: %[[VAL_5:.*]] = "tf.Less"(%[[VAL_1]], %[[VAL_4]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> -// CHECK: %[[VAL_6:.*]] = "tf.Equal"(%[[VAL_3]], %[[VAL_5]]) {incompatible_shape_error = true} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> +// CHECK: %[[VAL_6:.*]] = "tf.Equal"(%[[VAL_3]], %[[VAL_5]]) <{incompatible_shape_error = true}> : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> // CHECK: %[[VAL_7:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: %[[VAL_8:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<3xi32>) -> tensor<3xi32> // CHECK: %[[VAL_9:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: %[[VAL_10:.*]] = "tf.Const"() {value = dense<1> : tensor<2x3xi32>} : () -> tensor<2x3xi32> +// CHECK: %[[VAL_10:.*]] = "tf.Const"() <{value = dense<1> : tensor<2x3xi32>}> : () -> tensor<2x3xi32> // CHECK: %[[VAL_11:.*]] = "tf.Sub"(%[[VAL_9]], %[[VAL_10]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: %[[VAL_12:.*]] = "tf.AddV2"(%[[VAL_8]], %[[VAL_11]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK: %[[VAL_13:.*]] = "tf.Neg"(%[[VAL_12]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> @@ -669,7 +669,7 @@ func.func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) // CHECK-LABEL: func @equal( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: 
%[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> // CHECK: return %[[VAL_2]] : tensor<2xi1> // CHECK: } func.func @equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { @@ -680,7 +680,7 @@ func.func @equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { // CHECK-LABEL: func @equal_dynamic( // CHECK-SAME: %[[VAL_0:.*]]: tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { -// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor, tensor<1xi32>) -> tensor // CHECK: return %[[VAL_2]] : tensor // CHECK: } func.func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { @@ -691,7 +691,7 @@ func.func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1x1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<1x1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @equal_broadcast(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { @@ -703,7 +703,7 @@ func.func @equal_broadcast(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> te // CHECK-LABEL: func @equal_broadcast_chlo( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @equal_broadcast_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { @@ -714,7 +714,7 @@ func.func @equal_broadcast_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> // CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { @@ -725,7 +725,7 @@ func.func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %a // CHECK-LABEL: func @equal_incompatible_shape_broadcastable( // CHECK-SAME: %[[VAL_0:.*]]: tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { -// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor, tensor<1xi32>) -> tensor // CHECK: return %[[VAL_2]] : tensor // CHECK: } 
func.func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { @@ -743,7 +743,7 @@ func.func @equal_unsupported_compare_type(%arg0: tensor<1xf32>, %arg1: tensor<1x // CHECK-LABEL: func @notequal( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> // CHECK: return %[[VAL_2]] : tensor<2xi1> // CHECK: } func.func @notequal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { @@ -754,7 +754,7 @@ func.func @notequal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> // CHECK-LABEL: func @notequal_broadcast( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x1xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1x1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<1x1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @notequal_broadcast(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { @@ -766,7 +766,7 @@ func.func @notequal_broadcast(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> // CHECK-LABEL: func @notequal_broadcast_chlo( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @notequal_broadcast_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { @@ -777,7 +777,7 @@ func.func @notequal_broadcast_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) // CHECK-LABEL: func @notequal_broadcast_no_incompatible_shapes_error( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { @@ -788,7 +788,7 @@ func.func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, // CHECK-LABEL: func @notequal_incompatible_shape_broadcastable( // CHECK-SAME: %[[VAL_0:.*]]: tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { -// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor, tensor<1xi32>) -> tensor // CHECK: return %[[VAL_2]] : 
tensor // CHECK: } func.func @notequal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { @@ -942,7 +942,7 @@ func.func @broadcast_less_equal_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32 // CHECK-LABEL: func @concat_v2( // CHECK-SAME: %[[VAL_0:.*]]: tensor<3x3xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3xf32>) -> tensor<6x3xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[VAL_3:.*]] = "tf.ConcatV2"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> // CHECK: return %[[VAL_3]] : tensor<6x3xf32> // CHECK: } @@ -954,7 +954,7 @@ func.func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6 // CHECK-LABEL: func @concat_v2_1d_axis( // CHECK-SAME: %[[VAL_0:.*]]: tensor<3x3xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3xf32>) -> tensor<3x6xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[VAL_3:.*]] = "tf.ConcatV2"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<3x6xf32> // CHECK: return %[[VAL_3]] : tensor<3x6xf32> // CHECK: } @@ -964,7 +964,7 @@ func.func @concat_v2_1d_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> } // CHECK-LABEL: func @const() -> tensor<2xi32> { -// CHECK: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %[[VAL_0:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: return %[[VAL_0]] : tensor<2xi32> // CHECK: } func.func @const() -> tensor<2xi32> { @@ -974,7 +974,7 @@ func.func @const() -> tensor<2xi32> { // CHECK-LABEL: func @relu( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>) -> tensor<1xi32> { -// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[VAL_2:.*]] = "tf.Maximum"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> // CHECK: return %[[VAL_2]] : tensor<1xi32> // CHECK: } @@ -986,7 +986,7 @@ func.func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @relu_unranked( // CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { -// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[VAL_2:.*]] = "tf.Maximum"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor) -> tensor // CHECK: return %[[VAL_2]] : tensor // CHECK: } @@ -998,8 +998,8 @@ func.func @relu_unranked(%arg0: tensor) -> tensor { // CHECK-LABEL: func @relu6( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>) -> tensor<1xi32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor // CHECK: %[[VAL_3:.*]] = "tf.Minimum"(%[[VAL_0]], %[[VAL_2]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> // CHECK: %[[VAL_4:.*]] = "tf.Maximum"(%[[VAL_3]], %[[VAL_1]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> // CHECK: return %[[VAL_4]] : tensor<1xi32> @@ -1014,8 +1014,8 @@ 
func.func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @relu6_unranked( // CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor // CHECK: %[[VAL_3:.*]] = "tf.Minimum"(%[[VAL_0]], %[[VAL_2]]) : (tensor, tensor) -> tensor // CHECK: %[[VAL_4:.*]] = "tf.Maximum"(%[[VAL_3]], %[[VAL_1]]) : (tensor, tensor) -> tensor // CHECK: return %[[VAL_4]] : tensor @@ -1031,9 +1031,9 @@ func.func @relu6_unranked(%arg0: tensor) -> tensor { // CHECK-LABEL: func @relu_grad( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor<4x8xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor // CHECK: %[[VAL_3:.*]] = "tf.Greater"(%[[VAL_1]], %[[VAL_2]]) : (tensor, tensor) -> tensor -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<4x8xf32>} : () -> tensor<4x8xf32> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<4x8xf32>}> : () -> tensor<4x8xf32> // CHECK: %[[VAL_5:.*]] = "tf.Select"(%[[VAL_3]], %[[VAL_0]], %[[VAL_4]]) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> // CHECK: return %[[VAL_5]] : tensor<4x8xf32> // CHECK: } @@ -1133,9 +1133,9 @@ func.func @selectv2_broadcasted_condition(%arg0: tensor<1x1xi1>, %arg1: tensor<1 // CHECK-LABEL: func @transpose_2d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3xf32>) -> tensor<3x2xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<2x3xf32>, tensor<2xi64>) -> tensor<3x2xf32> // CHECK: return %[[VAL_4]] : tensor<3x2xf32> // CHECK: } @@ -1148,9 +1148,9 @@ func.func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { // CHECK-LABEL: func @transpose_3d_int32( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<[2, 1, 0]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[2, 1, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<[2, 1, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK: %[[VAL_4:.*]] = 
"tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> // CHECK: return %[[VAL_4]] : tensor<3x2x1xf32> // CHECK: } @@ -1163,9 +1163,9 @@ func.func @transpose_3d_int32(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { // CHECK-LABEL: func @transpose_3d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<[2, 1, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[2, 1, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<[2, 1, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> // CHECK: return %[[VAL_4]] : tensor<3x2x1xf32> // CHECK: } @@ -1178,9 +1178,9 @@ func.func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { // CHECK-LABEL: func @transpose_dynamic_2d( // CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor<4x?xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor, tensor<2xi64>) -> tensor<4x?xf32> // CHECK: return %[[VAL_4]] : tensor<4x?xf32> // CHECK: } @@ -1193,9 +1193,9 @@ func.func @transpose_dynamic_2d(%arg0: tensor) -> tensor<4x?xf32> { // CHECK-LABEL: func @transpose_unranked_2d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<*xf32>, tensor<2xi64>) -> tensor<*xf32> // CHECK: return %[[VAL_4]] : tensor<*xf32> // CHECK: } @@ -1488,9 +1488,9 @@ func.func @neg_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: func @sigmoid( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() 
{value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<2> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK: %[[VAL_4:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_3]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[VAL_5:.*]] = "tf.Tanh"(%[[VAL_4]]) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[VAL_6:.*]] = "tf.Mul"(%[[VAL_5]], %[[VAL_3]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> @@ -1671,10 +1671,10 @@ func.func @bitcast_same_widths(%arg0: tensor<2xf32>) -> tensor<2xi32> { // CHECK-LABEL: func @sign( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3x4xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> -// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> -// CHECK: %[[VAL_4:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> -// CHECK: %[[VAL_5:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1x2x3x4xf32>}> : () -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_4:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) <{incompatible_shape_error = true}> : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> +// CHECK: %[[VAL_5:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1x2x3x4xf32>}> : () -> tensor<1x2x3x4xf32> // CHECK: %[[VAL_6:.*]] = "tf.Sign"(%[[VAL_0]]) : (tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> // CHECK: %[[VAL_7:.*]] = "tf.Select"(%[[VAL_4]], %[[VAL_5]], %[[VAL_6]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> // CHECK: %[[VAL_8:.*]] = "tf.Select"(%[[VAL_2]], %[[VAL_3]], %[[VAL_7]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> @@ -1693,7 +1693,7 @@ func.func @sign(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<1x2x3x4xf32>) -> tenso // CHECK-LABEL: func @size_rank_one_i32( // CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { -// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: return %[[VAL_1]] : tensor // CHECK: } func.func @size_rank_one_i32(%arg0: tensor) -> tensor { @@ -1703,7 +1703,7 @@ func.func @size_rank_one_i32(%arg0: tensor) -> tensor { // CHECK-LABEL: func @size_rank_one_i64( // CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { -// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: return %[[VAL_1]] : tensor // CHECK: } func.func @size_rank_one_i64(%arg0: tensor) -> tensor { @@ -1724,7 +1724,7 @@ func.func 
@complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcompl // CHECK-LABEL: func @convert_i32_f32( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.Cast"(%[[VAL_0]]) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> +// CHECK: %[[VAL_1:.*]] = "tf.Cast"(%[[VAL_0]]) <{Truncate = false}> : (tensor<2xi32>) -> tensor<2xf32> // CHECK: return %[[VAL_1]] : tensor<2xf32> // CHECK: } func.func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { @@ -1734,11 +1734,11 @@ func.func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { // CHECK-LABEL: func @convert_slice( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4672xf32>) -> tensor<1x519xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[0, 4153]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 4672]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<1> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<[0, 4153]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[1, 4672]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<1> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_4:.*]] = "tf.StridedSlice"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) -// CHECK-SAME: {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} +// CHECK-SAME: <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> // CHECK-SAME: (tensor<1x4672xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x519xf32> // CHECK: return %[[VAL_4]] : tensor<1x519xf32> // CHECK: } @@ -1786,7 +1786,7 @@ func.func @round_nearest_even(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK-SAME: %[[VAL_1:.*]]: tensor<256xf32>) -> tensor<1xf32> { // CHECK: %[[VAL_2:.*]] = arith.constant dense<[256, 1]> : tensor<2xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<256x1xf32> -// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_0]], %[[VAL_3]]) {adj_x = false, adj_y = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_0]], %[[VAL_3]]) <{adj_x = false, adj_y = false}> : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> // CHECK: %[[VAL_5:.*]] = arith.constant dense<1> : tensor<1xi64> // CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: return %[[VAL_6]] : tensor<1xf32> @@ -1803,7 +1803,7 @@ func.func @convert_dot_2d_1d(%arg0: tensor<1x256xf32>, %arg1: tensor<256xf32>) - // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> // CHECK: %[[VAL_4:.*]] = arith.constant dense<[256, 1]> : tensor<2xi64> // CHECK: %[[VAL_5:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_4]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<256x1xf32> -// CHECK: %[[VAL_6:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_5]]) {adj_x = false, adj_y = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_6:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_5]]) <{adj_x = false, adj_y = false}> : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> // CHECK: %[[VAL_7:.*]] = 
arith.constant dense<> : tensor<0xi64> // CHECK: %[[VAL_8:.*]] = "tf.Reshape"(%[[VAL_6]], %[[VAL_7]]) : (tensor<1x1xf32>, tensor<0xi64>) -> tensor // CHECK: return %[[VAL_8]] : tensor @@ -1816,7 +1816,7 @@ func.func @convert_dot_1d_1d(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> // CHECK-LABEL: func @convert_dot_2d_2d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<256x1xf32>) -> tensor<1x1xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.BatchMatMulV3"(%[[VAL_0]], %[[VAL_1]]) {adj_x = false, adj_y = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_2:.*]] = "tf.BatchMatMulV3"(%[[VAL_0]], %[[VAL_1]]) <{adj_x = false, adj_y = false}> : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> // CHECK: return %[[VAL_2]] : tensor<1x1xf32> // CHECK: } func.func @convert_dot_2d_2d(%arg0: tensor<1x256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1x1xf32> { @@ -1861,9 +1861,9 @@ func.func @dynamic_broadcast_in_dim_tf_style(%arg0: tensor, %arg1 // CHECK-LABEL: func @dynamic_broadcast_in_dim_general_case_expand_back_dims( // CHECK-SAME: %[[ARG_0:.*]]: tensor, // CHECK-SAME: %[[ARG_1:.*]]: tensor<4xi32>) -> tensor { -// CHECK %[[CST_0:.*]] = "tf.Const"() {value = dense<2> : tensor} : () -> tensor +// CHECK %[[CST_0:.*]] = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor // CHECK %[[VAL_0:.*]] = "tf.ExpandDims"(%[[ARG_0]], %[[CST_0]]) : (tensor, tensor) -> tensor -// CHECK %[[CST_1:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor +// CHECK %[[CST_1:.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK %[[VAL_1:.*]] = "tf.ExpandDims"(%[[VAL_0]], %[[CST_1]]) : (tensor, tensor) -> tensor // CHECK %[[VAL_2:.*]] = "tf.BroadcastTo"(%[[VAL_1]], %[[ARG_1]]) : (tensor, tensor<4xi32>) -> tensor // CHECK return %[[VAL_2]] : tensor @@ -1875,7 +1875,7 @@ func.func @dynamic_broadcast_in_dim_general_case_expand_back_dims(%arg0: tensor< // CHECK-LABEL: func @dynamic_broadcast_in_dim_general_case_expand_middle_dim( // CHECK-SAME: %[[ARG_0:.*]]: tensor, // CHECK-SAME: %[[ARG_1:.*]]: tensor<4xi32>) -> tensor { -// CHECK %[[CST_0:.*]] = "tf.Const"() {value = dense<2> : tensor} : () -> tensor +// CHECK %[[CST_0:.*]] = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor // CHECK %[[VAL_0:.*]] = "tf.ExpandDims"(%[[ARG_0]], %[[CST_0]]) : (tensor, tensor) -> tensor // CHECK %[[VAL_1:.*]] = "tf.BroadcastTo"(%[[VAL_0]], %[[ARG_1]]) : (tensor, tensor<4xi32>) -> tensor // CHECK return %[[VAL_1]] : tensor @@ -1887,15 +1887,15 @@ func.func @dynamic_broadcast_in_dim_general_case_expand_middle_dim(%arg0: tensor // CHECK-LABEL: func @convert_dot_general( // CHECK-SAME: %[[VAL_0:.*]]: tensor<3x2x6x5x1xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x2x4x6xf32>) -> tensor<3x5x1x4xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[0, 3, 4, 1, 2]> : tensor<5xi64>} : () -> tensor<5xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[0, 3, 4, 1, 2]> : tensor<5xi64>}> : () -> tensor<5xi64> // CHECK: %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_2]]) : (tensor<3x2x6x5x1xf32>, tensor<5xi64>) -> tensor<3x5x1x2x6xf32> -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_1]], %[[VAL_4]]) : (tensor<3x2x4x6xf32>, tensor<4xi64>) -> tensor<3x2x6x4xf32> // CHECK: %[[VAL_6:.*]] = arith.constant dense<[3, 5, 
12]> : tensor<3xi64> // CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_3]], %[[VAL_6]]) : (tensor<3x5x1x2x6xf32>, tensor<3xi64>) -> tensor<3x5x12xf32> // CHECK: %[[VAL_8:.*]] = arith.constant dense<[3, 12, 4]> : tensor<3xi64> // CHECK: %[[VAL_9:.*]] = "tf.Reshape"(%[[VAL_5]], %[[VAL_8]]) : (tensor<3x2x6x4xf32>, tensor<3xi64>) -> tensor<3x12x4xf32> -// CHECK: %[[VAL_10:.*]] = "tf.BatchMatMulV3"(%[[VAL_7]], %[[VAL_9]]) {adj_x = false, adj_y = false} : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> +// CHECK: %[[VAL_10:.*]] = "tf.BatchMatMulV3"(%[[VAL_7]], %[[VAL_9]]) <{adj_x = false, adj_y = false}> : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> // CHECK: %[[VAL_11:.*]] = arith.constant dense<[3, 5, 1, 4]> : tensor<4xi64> // CHECK: %[[VAL_12:.*]] = "tf.Reshape"(%[[VAL_10]], %[[VAL_11]]) : (tensor<3x5x4xf32>, tensor<4xi64>) -> tensor<3x5x1x4xf32> // CHECK: return %[[VAL_12]] : tensor<3x5x1x4xf32> @@ -1929,7 +1929,7 @@ func.func @quantized_dot_general_not_converted(%arg0: tensor<1x1x512xf32>, %arg1 // CHECK-SAME: %[[VAL_1:.*]]: tensor<1024x1024xf32>) -> tensor<1x1x1024xf32> { // CHECK: %[[VAL_2:.*]] = arith.constant dense<[1, 1024]> : tensor<2xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : {{.*}} -> tensor<1x1024xf32> -// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_1]]) {adj_x = false, adj_y = false} : {{.*}} -> tensor<1x1024xf32> +// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_1]]) <{adj_x = false, adj_y = false}> : {{.*}} -> tensor<1x1024xf32> // CHECK: %[[VAL_5:.*]] = arith.constant dense<[1, 1, 1024]> : tensor<3xi64> // CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], %[[VAL_5]]) : {{.*}} -> tensor<1x1x1024xf32> // CHECK: return %[[VAL_6]] : tensor<1x1x1024xf32> @@ -1952,7 +1952,7 @@ func.func @convert_dot_general_repeated(%arg0: tensor<1x1x1024xf32>, %arg1: tens // CHECK-SAME: %[[VAL_1:.*]]: tensor<256x8xi8>) -> tensor<8xi32> { // CHECK: %[[VAL_2:.*]] = arith.constant dense<[1, 256]> : tensor<2xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<256xi8>, tensor<2xi64>) -> tensor<1x256xi8> -// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_1]]) {adj_x = false, adj_y = false} : (tensor<1x256xi8>, tensor<256x8xi8>) -> tensor<1x8xi32> +// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_1]]) <{adj_x = false, adj_y = false}> : (tensor<1x256xi8>, tensor<256x8xi8>) -> tensor<1x8xi32> // CHECK: %[[VAL_5:.*]] = arith.constant dense<8> : tensor<1xi64> // CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x8xi32>, tensor<1xi64>) -> tensor<8xi32> // CHECK: return %[[VAL_6]] : tensor<8xi32> @@ -1970,26 +1970,26 @@ func.func @convert_dot_general_int8(%arg0: tensor<256xi8>, %arg1: tensor<256x8xi // CHECK-LABEL: func @convert_dot_general_dynamic_rhs_out_dim( // CHECK-SAME: %arg0: tensor<4x4x256xf32>, // CHECK-SAME: %arg1: tensor<4x?x256xf32>) -> tensor<4x4x?xf32> { -// CHECK-DAG: %cst = "tf.Const"() {value = dense<[0, 2, 1]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK-DAG: %cst = "tf.Const"() <{value = dense<[0, 2, 1]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK: %0 = "tf.Transpose"(%arg1, %cst) : (tensor<4x?x256xf32>, tensor<3xi64>) -> tensor<4x256x?xf32> // CHECK: %1 = "tf.Shape"(%arg1) : (tensor<4x?x256xf32>) -> tensor<3xi32> -// CHECK-DAG: %cst_0 = "tf.Const"() {value = dense<[-1, 0, -1]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-DAG: %cst_1 = "tf.Const"() {value = dense<[-1, -1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> -// 
CHECK-DAG: %cst_2 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %cst_0 = "tf.Const"() <{value = dense<[-1, 0, -1]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-DAG: %cst_1 = "tf.Const"() <{value = dense<[-1, -1, 0]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-DAG: %cst_2 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %2 = "tf.UnsortedSegmentProd"(%1, %cst_0, %cst_2) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> // CHECK: %3 = "tf.UnsortedSegmentProd"(%1, %cst_1, %cst_2) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> -// CHECK-DAG: %cst_3 = "tf.Const"() {value = dense<4> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-DAG: %cst_4 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_3 = "tf.Const"() <{value = dense<4> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %cst_4 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %4 = "tf.Concat"(%cst_4, %cst_3, %3, %2) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %5 = "tf.Reshape"(%0, %4) : (tensor<4x256x?xf32>, tensor<3xi32>) -> tensor<4x256x?xf32> -// CHECK: %6 = "tf.BatchMatMulV3"(%arg0, %5) {adj_x = false, adj_y = false} : (tensor<4x4x256xf32>, tensor<4x256x?xf32>) -> tensor<4x4x?xf32> +// CHECK: %6 = "tf.BatchMatMulV3"(%arg0, %5) <{adj_x = false, adj_y = false}> : (tensor<4x4x256xf32>, tensor<4x256x?xf32>) -> tensor<4x4x?xf32> // CHECK: %7 = "tf.Shape"(%arg0) : (tensor<4x4x256xf32>) -> tensor<3xi32> // CHECK: %8 = "tf.Shape"(%arg1) : (tensor<4x?x256xf32>) -> tensor<3xi32> -// CHECK-DAG: %cst_5 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: %9 = "tf.Gather"(%7, %cst_5) {validate_indices = true} : (tensor<3xi32>, tensor<2xi64>) -> tensor<2xi32> -// CHECK-DAG: %cst_6 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %10 = "tf.Gather"(%8, %cst_6) {validate_indices = true} : (tensor<3xi32>, tensor<1xi64>) -> tensor<1xi32> -// CHECK-DAG: %cst_7 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_5 = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK: %9 = "tf.Gather"(%7, %cst_5) <{validate_indices = true}> : (tensor<3xi32>, tensor<2xi64>) -> tensor<2xi32> +// CHECK-DAG: %cst_6 = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %10 = "tf.Gather"(%8, %cst_6) <{validate_indices = true}> : (tensor<3xi32>, tensor<1xi64>) -> tensor<1xi32> +// CHECK-DAG: %cst_7 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %11 = "tf.Concat"(%cst_7, %9, %10) : (tensor, tensor<2xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %12 = "tf.Reshape"(%6, %11) : (tensor<4x4x?xf32>, tensor<3xi32>) -> tensor<4x4x?xf32> // CHECK: return %12 : tensor<4x4x?xf32> @@ -2008,38 +2008,38 @@ func.return %0 : tensor<4x4x?xf32> // CHECK-LABEL: func @convert_dot_general_dynamic_batch_dim( // CHECK-SAME: %arg0: tensor<2x?x2x3xf32>, // CHECK-SAME: %arg1: tensor<2x?x4x3xf32>) -> tensor<2x?x2x4xf32> { -// CHECK-DAG: %cst = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %cst = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %0 = "tf.Transpose"(%arg1, %cst) : (tensor<2x?x4x3xf32>, tensor<4xi64>) -> tensor<2x?x3x4xf32> // CHECK: %1 = "tf.Shape"(%arg0) : (tensor<2x?x2x3xf32>) -> tensor<4xi32> -// CHECK-DAG: %cst_0 = "tf.Const"() {value = dense<[-1, 
-1, 0, -1]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: %cst_1 = "tf.Const"() {value = dense<[-1, -1, -1, 0]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: %cst_2 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %cst_0 = "tf.Const"() <{value = dense<[-1, -1, 0, -1]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: %cst_1 = "tf.Const"() <{value = dense<[-1, -1, -1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: %cst_2 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %2 = "tf.UnsortedSegmentProd"(%1, %cst_0, %cst_2) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> // CHECK: %3 = "tf.UnsortedSegmentProd"(%1, %cst_1, %cst_2) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> -// CHECK-DAG: %cst_3 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: %4 = "tf.Gather"(%1, %cst_3) {validate_indices = true} : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> -// CHECK-DAG: %cst_4 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_3 = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK: %4 = "tf.Gather"(%1, %cst_3) <{validate_indices = true}> : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> +// CHECK-DAG: %cst_4 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %5 = "tf.Concat"(%cst_4, %4, %2, %3) : (tensor, tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> // CHECK: %6 = "tf.Reshape"(%arg0, %5) : (tensor<2x?x2x3xf32>, tensor<4xi32>) -> tensor<2x?x2x3xf32> // CHECK: %7 = "tf.Shape"(%arg1) : (tensor<2x?x4x3xf32>) -> tensor<4xi32> -// CHECK-DAG: %cst_5 = "tf.Const"() {value = dense<[-1, -1, 0, -1]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: %cst_6 = "tf.Const"() {value = dense<[-1, -1, -1, 0]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: %cst_7 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %cst_5 = "tf.Const"() <{value = dense<[-1, -1, 0, -1]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: %cst_6 = "tf.Const"() <{value = dense<[-1, -1, -1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: %cst_7 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %8 = "tf.UnsortedSegmentProd"(%7, %cst_5, %cst_7) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> // CHECK: %9 = "tf.UnsortedSegmentProd"(%7, %cst_6, %cst_7) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> -// CHECK-DAG: %cst_8 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: %10 = "tf.Gather"(%7, %cst_8) {validate_indices = true} : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> -// CHECK-DAG: %cst_9 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_8 = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK: %10 = "tf.Gather"(%7, %cst_8) <{validate_indices = true}> : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> +// CHECK-DAG: %cst_9 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %11 = "tf.Concat"(%cst_9, %10, %9, %8) : (tensor, tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> // CHECK: %12 = "tf.Reshape"(%0, %11) : (tensor<2x?x3x4xf32>, tensor<4xi32>) -> tensor<2x?x3x4xf32> -// CHECK: %13 = "tf.BatchMatMulV3"(%6, %12) {adj_x = false, adj_y = false} : (tensor<2x?x2x3xf32>, tensor<2x?x3x4xf32>) -> tensor<2x?x2x4xf32> +// CHECK: %13 = "tf.BatchMatMulV3"(%6, %12) <{adj_x = false, adj_y = false}> : 
(tensor<2x?x2x3xf32>, tensor<2x?x3x4xf32>) -> tensor<2x?x2x4xf32> // CHECK: %14 = "tf.Shape"(%arg0) : (tensor<2x?x2x3xf32>) -> tensor<4xi32> // CHECK: %15 = "tf.Shape"(%arg1) : (tensor<2x?x4x3xf32>) -> tensor<4xi32> -// CHECK-DAG: %cst_10 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: %16 = "tf.Gather"(%14, %cst_10) {validate_indices = true} : (tensor<4xi32>, tensor<3xi64>) -> tensor<3xi32> -// CHECK: %cst_11 = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %17 = "tf.Gather"(%15, %cst_11) {validate_indices = true} : (tensor<4xi32>, tensor<1xi64>) -> tensor<1xi32> -// CHECK-DAG: %cst_12 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_10 = "tf.Const"() <{value = dense<[0, 1, 2]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK: %16 = "tf.Gather"(%14, %cst_10) <{validate_indices = true}> : (tensor<4xi32>, tensor<3xi64>) -> tensor<3xi32> +// CHECK: %cst_11 = "tf.Const"() <{value = dense<2> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %17 = "tf.Gather"(%15, %cst_11) <{validate_indices = true}> : (tensor<4xi32>, tensor<1xi64>) -> tensor<1xi32> +// CHECK-DAG: %cst_12 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %18 = "tf.Concat"(%cst_12, %16, %17) : (tensor, tensor<3xi32>, tensor<1xi32>) -> tensor<4xi32> // CHECK: %19 = "tf.Reshape"(%13, %18) : (tensor<2x?x2x4xf32>, tensor<4xi32>) -> tensor<2x?x2x4xf32> // CHECK: return %19 : tensor<2x?x2x4xf32> @@ -2058,36 +2058,36 @@ func.return %0 : tensor<2x?x2x4xf32> // CHECK-LABEL: func @convert_dot_general_dynamic_lhs_rhs_out_dims( // CHECK-SAME: %arg0: tensor<2x2x?x3xf32>, // CHECK-SAME: %arg1: tensor<2x4x?x3xf32>) -> tensor<2x2x?x4x?xf32> { -// CHECK-DAG: %cst = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %cst = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %0 = "tf.Transpose"(%arg1, %cst) : (tensor<2x4x?x3xf32>, tensor<4xi64>) -> tensor<2x3x4x?xf32> // CHECK: %1 = "tf.Shape"(%arg0) : (tensor<2x2x?x3xf32>) -> tensor<4xi32> -// CHECK-DAG: %cst_0 = "tf.Const"() {value = dense<[-1, 0, 0, -1]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: %cst_1 = "tf.Const"() {value = dense<[-1, -1, -1, 0]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: %cst_2 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %cst_0 = "tf.Const"() <{value = dense<[-1, 0, 0, -1]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: %cst_1 = "tf.Const"() <{value = dense<[-1, -1, -1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: %cst_2 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %2 = "tf.UnsortedSegmentProd"(%1, %cst_0, %cst_2) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> // CHECK: %3 = "tf.UnsortedSegmentProd"(%1, %cst_1, %cst_2) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> -// CHECK-DAG: %cst_3 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-DAG: %cst_4 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_3 = "tf.Const"() <{value = dense<2> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %cst_4 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %4 = "tf.Concat"(%cst_4, %cst_3, %2, %3) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %5 = "tf.Reshape"(%arg0, %4) : (tensor<2x2x?x3xf32>, tensor<3xi32>) -> tensor<2x?x3xf32> // CHECK: %6 = 
"tf.Shape"(%arg1) : (tensor<2x4x?x3xf32>) -> tensor<4xi32> -// CHECK-DAG: %cst_5 = "tf.Const"() {value = dense<[-1, 0, 0, -1]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: %cst_6 = "tf.Const"() {value = dense<[-1, -1, -1, 0]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: %cst_7 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %cst_5 = "tf.Const"() <{value = dense<[-1, 0, 0, -1]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: %cst_6 = "tf.Const"() <{value = dense<[-1, -1, -1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: %cst_7 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %7 = "tf.UnsortedSegmentProd"(%6, %cst_5, %cst_7) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> // CHECK: %8 = "tf.UnsortedSegmentProd"(%6, %cst_6, %cst_7) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> -// CHECK-DAG: %cst_8 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-DAG: %cst_9 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_8 = "tf.Const"() <{value = dense<2> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %cst_9 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %9 = "tf.Concat"(%cst_9, %cst_8, %8, %7) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %10 = "tf.Reshape"(%0, %9) : (tensor<2x3x4x?xf32>, tensor<3xi32>) -> tensor<2x3x?xf32> -// CHECK: %11 = "tf.BatchMatMulV3"(%5, %10) {adj_x = false, adj_y = false} : (tensor<2x?x3xf32>, tensor<2x3x?xf32>) -> tensor<2x?x?xf32> +// CHECK: %11 = "tf.BatchMatMulV3"(%5, %10) <{adj_x = false, adj_y = false}> : (tensor<2x?x3xf32>, tensor<2x3x?xf32>) -> tensor<2x?x?xf32> // CHECK: %12 = "tf.Shape"(%arg0) : (tensor<2x2x?x3xf32>) -> tensor<4xi32> // CHECK: %13 = "tf.Shape"(%arg1) : (tensor<2x4x?x3xf32>) -> tensor<4xi32> -// CHECK-DAG: %cst_10 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: %14 = "tf.Gather"(%12, %cst_10) {validate_indices = true} : (tensor<4xi32>, tensor<3xi64>) -> tensor<3xi32> -// CHECK-DAG: %cst_11 = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: %15 = "tf.Gather"(%13, %cst_11) {validate_indices = true} : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> -// CHECK-DAG: %cst_12 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_10 = "tf.Const"() <{value = dense<[0, 1, 2]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK: %14 = "tf.Gather"(%12, %cst_10) <{validate_indices = true}> : (tensor<4xi32>, tensor<3xi64>) -> tensor<3xi32> +// CHECK-DAG: %cst_11 = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK: %15 = "tf.Gather"(%13, %cst_11) <{validate_indices = true}> : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> +// CHECK-DAG: %cst_12 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %16 = "tf.Concat"(%cst_12, %14, %15) : (tensor, tensor<3xi32>, tensor<2xi32>) -> tensor<5xi32> // CHECK: %17 = "tf.Reshape"(%11, %16) : (tensor<2x?x?xf32>, tensor<5xi32>) -> tensor<2x2x?x4x?xf32> // CHECK: return %17 : tensor<2x2x?x4x?xf32> @@ -2107,26 +2107,26 @@ func.return %0 : tensor<2x2x?x4x?xf32> // CHECK-SAME: %arg0: tensor<4x4x?xf32>, // CHECK-SAME: %arg1: tensor<4x?x256xf32>) -> tensor<4x4x256xf32> { // CHECK: %0 = "tf.Shape"(%arg0) : (tensor<4x4x?xf32>) -> tensor<3xi32> -// CHECK-DAG: %cst = "tf.Const"() {value = dense<[-1, 0, -1]> : tensor<3xi32>} : () -> tensor<3xi32> -// 
CHECK-DAG: %cst_0 = "tf.Const"() {value = dense<[-1, -1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-DAG: %cst_1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %cst = "tf.Const"() <{value = dense<[-1, 0, -1]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-DAG: %cst_0 = "tf.Const"() <{value = dense<[-1, -1, 0]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-DAG: %cst_1 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %1 = "tf.UnsortedSegmentProd"(%0, %cst, %cst_1) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> // CHECK: %2 = "tf.UnsortedSegmentProd"(%0, %cst_0, %cst_1) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> -// CHECK-DAG: %cst_2 = "tf.Const"() {value = dense<4> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-DAG: %cst_3 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_2 = "tf.Const"() <{value = dense<4> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %cst_3 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %3 = "tf.Concat"(%cst_3, %cst_2, %1, %2) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %4 = "tf.Reshape"(%arg0, %3) : (tensor<4x4x?xf32>, tensor<3xi32>) -> tensor<4x4x?xf32> // CHECK: %5 = "tf.Shape"(%arg1) : (tensor<4x?x256xf32>) -> tensor<3xi32> -// CHECK-DAG: %cst_4 = "tf.Const"() {value = dense<[-1, -1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-DAG: %cst_5 = "tf.Const"() {value = dense<[-1, 0, -1]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-DAG: %cst_6 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %cst_4 = "tf.Const"() <{value = dense<[-1, -1, 0]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-DAG: %cst_5 = "tf.Const"() <{value = dense<[-1, 0, -1]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-DAG: %cst_6 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %6 = "tf.UnsortedSegmentProd"(%5, %cst_4, %cst_6) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> // CHECK: %7 = "tf.UnsortedSegmentProd"(%5, %cst_5, %cst_6) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> -// CHECK-DAG: %cst_7 = "tf.Const"() {value = dense<4> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-DAG: %cst_8 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %cst_7 = "tf.Const"() <{value = dense<4> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %cst_8 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %8 = "tf.Concat"(%cst_8, %cst_7, %7, %6) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %9 = "tf.Reshape"(%arg1, %8) : (tensor<4x?x256xf32>, tensor<3xi32>) -> tensor<4x?x256xf32> -// CHECK: %10 = "tf.BatchMatMulV3"(%4, %9) {adj_x = false, adj_y = false} : (tensor<4x4x?xf32>, tensor<4x?x256xf32>) -> tensor<4x4x256xf32> +// CHECK: %10 = "tf.BatchMatMulV3"(%4, %9) <{adj_x = false, adj_y = false}> : (tensor<4x4x?xf32>, tensor<4x?x256xf32>) -> tensor<4x4x256xf32> // CHECK: return %10 : tensor<4x4x256xf32> // CHECK: } func.func @convert_dot_general_dynamic_contracting_dim(%arg0: tensor<4x4x?xf32>, %arg1: tensor<4x?x256xf32>) -> tensor<4x4x256xf32> { @@ -2145,14 +2145,14 @@ func.return %0 : tensor<4x4x256xf32> // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x256x256xbf16>) -> tensor<16x32x256xbf16> { // CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[16, 32, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<16x32x256xbf16>, tensor<4xi64>) 
-> tensor<16x32x256x1xbf16> -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_3]], %[[VAL_4]]) : (tensor<16x32x256x1xbf16>, tensor<4xi64>) -> tensor<16x32x1x256xbf16> // CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<[1, 256, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_6]]) : (tensor<1x256x256xbf16>, tensor<4xi64>) -> tensor<1x256x256x1xbf16> -// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7]], %[[VAL_8]]) : (tensor<1x256x256x1xbf16>, tensor<4xi64>) -> tensor<1x1x256x256xbf16> -// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<16x32x1x256xbf16>, tensor<1x1x256x256xbf16>) -> tensor<16x32x1x256xbf16> -// CHECK: %[[VAL_11:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<16x32x1x256xbf16>, tensor<1x1x256x256xbf16>) -> tensor<16x32x1x256xbf16> +// CHECK: %[[VAL_11:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_12:.*]] = "tf.Transpose"(%[[VAL_10]], %[[VAL_11]]) : (tensor<16x32x1x256xbf16>, tensor<4xi64>) -> tensor<16x32x256x1xbf16> // CHECK: %[[VAL_13:.*]] = arith.constant dense<[16, 32, 256]> : tensor<3xi64> // CHECK: %[[VAL_14:.*]] = "tf.Reshape"(%[[VAL_12]], %[[VAL_13]]) : (tensor<16x32x256x1xbf16>, tensor<3xi64>) -> tensor<16x32x256xbf16> @@ -2177,14 +2177,14 @@ func.func @convert_conv1d(%arg0: tensor<16x32x256xbf16>, %arg1: tensor<1x256x256 // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x256x256xbf16>) -> tensor { // CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[-9223372036854775808, 32, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor, tensor<4xi64>) -> tensor -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_3]], %[[VAL_4]]) : (tensor, tensor<4xi64>) -> tensor // CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<[1, 256, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_6]]) : (tensor<1x256x256xbf16>, tensor<4xi64>) -> tensor<1x256x256x1xbf16> -// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7]], %[[VAL_8]]) : (tensor<1x256x256x1xbf16>, tensor<4xi64>) -> tensor<1x1x256x256xbf16> -// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = 
"VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor, tensor<1x1x256x256xbf16>) -> tensor -// CHECK: %[[VAL_11:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor, tensor<1x1x256x256xbf16>) -> tensor +// CHECK: %[[VAL_11:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_12:.*]] = "tf.Transpose"(%[[VAL_10]], %[[VAL_11]]) : (tensor, tensor<4xi64>) -> tensor // CHECK: %[[VAL_13:.*]] = arith.constant dense<[-9223372036854775808, 32, 256]> : tensor<3xi64> // CHECK: %[[VAL_14:.*]] = "tf.Reshape"(%[[VAL_12]], %[[VAL_13]]) : (tensor, tensor<3xi64>) -> tensor @@ -2211,14 +2211,14 @@ func.func @convert_conv1d_dynamic_batch(%arg0: tensor, %arg1: ten // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x256x256xbf16>) -> tensor<16x32x256xbf16> { // CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[16, 32, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<16x32x256xbf16>, tensor<4xi64>) -> tensor<16x32x256x1xbf16> -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_3]], %[[VAL_4]]) : (tensor<16x32x256x1xbf16>, tensor<4xi64>) -> tensor<16x32x1x256xbf16> // CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<[1, 256, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_6]]) : (tensor<1x256x256xbf16>, tensor<4xi64>) -> tensor<1x256x256x1xbf16> -// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7]], %[[VAL_8]]) : (tensor<1x256x256x1xbf16>, tensor<4xi64>) -> tensor<1x1x256x256xbf16> -// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<16x32x1x256xbf16>, tensor<1x1x256x256xbf16>) -> tensor<16x32x1x256xbf16> -// CHECK: %[[VAL_11:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<16x32x1x256xbf16>, tensor<1x1x256x256xbf16>) -> tensor<16x32x1x256xbf16> +// CHECK: %[[VAL_11:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_12:.*]] = "tf.Transpose"(%[[VAL_10]], %[[VAL_11]]) : (tensor<16x32x1x256xbf16>, tensor<4xi64>) -> tensor<16x32x256x1xbf16> // CHECK: %[[VAL_13:.*]] = arith.constant dense<[16, 32, 256]> : tensor<3xi64> // CHECK: %[[VAL_14:.*]] = "tf.Reshape"(%[[VAL_12]], %[[VAL_13]]) : (tensor<16x32x256x1xbf16>, tensor<3xi64>) -> tensor<16x32x256xbf16> @@ -2240,14 +2240,14 @@ func.func @convert_conv1d_no_lhs_dil_rhs_dil_precision_conf(%arg0: tensor<16x32x // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x256x256xbf16>) -> tensor { // CHECK-DAG: 
%[[VAL_2:.*]] = arith.constant dense<[-9223372036854775808, 32, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor, tensor<4xi64>) -> tensor -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_3]], %[[VAL_4]]) : (tensor, tensor<4xi64>) -> tensor // CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<[1, 256, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_6]]) : (tensor<1x256x256xbf16>, tensor<4xi64>) -> tensor<1x256x256x1xbf16> -// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7]], %[[VAL_8]]) : (tensor<1x256x256x1xbf16>, tensor<4xi64>) -> tensor<1x1x256x256xbf16> -// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor, tensor<1x1x256x256xbf16>) -> tensor -// CHECK: %[[VAL_11:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor, tensor<1x1x256x256xbf16>) -> tensor +// CHECK: %[[VAL_11:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_12:.*]] = "tf.Transpose"(%[[VAL_10]], %[[VAL_11]]) : (tensor, tensor<4xi64>) -> tensor // CHECK: %[[VAL_13:.*]] = arith.constant dense<[-9223372036854775808, 32, 256]> : tensor<3xi64> // CHECK: %[[VAL_14:.*]] = "tf.Reshape"(%[[VAL_12]], %[[VAL_13]]) : (tensor, tensor<3xi64>) -> tensor @@ -2271,14 +2271,14 @@ func.func @convert_conv1d_no_lhs_dil_rhs_dil_precision_conf_dynamic_batch(%arg0: // CHECK-SAME: %[[VAL_1:.*]]: tensor<256x1x256xbf16>) -> tensor<256x16x32xbf16> { // CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[32, 16, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<32x16x256xbf16>, tensor<4xi64>) -> tensor<32x16x256x1xbf16> -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[1, 0, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[1, 0, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_3]], %[[VAL_4]]) : (tensor<32x16x256x1xbf16>, tensor<4xi64>) -> tensor<16x32x1x256xbf16> // CHECK: %[[VAL_6:.*]] = arith.constant dense<[256, 1, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_6]]) : (tensor<256x1x256xbf16>, tensor<4xi64>) -> tensor<256x1x256x1xbf16> -// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() {value = dense<[1, 3, 2, 0]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() <{value = dense<[1, 3, 2, 0]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7]], %[[VAL_8]]) : (tensor<256x1x256x1xbf16>, tensor<4xi64>) -> tensor<1x1x256x256xbf16> -// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) {data_format = 
"NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<16x32x1x256xbf16>, tensor<1x1x256x256xbf16>) -> tensor<16x32x1x256xbf16> -// CHECK-DAG: %[[VAL_11:.*]] = "tf.Const"() {value = dense<[3, 0, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<16x32x1x256xbf16>, tensor<1x1x256x256xbf16>) -> tensor<16x32x1x256xbf16> +// CHECK-DAG: %[[VAL_11:.*]] = "tf.Const"() <{value = dense<[3, 0, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_12:.*]] = "tf.Transpose"(%[[VAL_10]], %[[VAL_11]]) : (tensor<16x32x1x256xbf16>, tensor<4xi64>) -> tensor<256x16x32x1xbf16> // CHECK: %[[VAL_13:.*]] = arith.constant dense<[256, 16, 32]> : tensor<3xi64> // CHECK: %[[VAL_14:.*]] = "tf.Reshape"(%[[VAL_12]], %[[VAL_13]]) : (tensor<256x16x32x1xbf16>, tensor<3xi64>) -> tensor<256x16x32xbf16> @@ -2343,14 +2343,14 @@ func.func @no_convert_conv1d_feature_group_gt_1(%arg0: tensor<16x32x256xbf16>, % // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x256x256xbf16>) -> tensor<16x32x256xbf16> { // CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[16, 32, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<16x32x256xbf16>, tensor<4xi64>) -> tensor<16x32x256x1xbf16> -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_3]], %[[VAL_4]]) : (tensor<16x32x256x1xbf16>, tensor<4xi64>) -> tensor<16x32x1x256xbf16> // CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<[1, 256, 256, 1]> : tensor<4xi64> // CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_6]]) : (tensor<1x256x256xbf16>, tensor<4xi64>) -> tensor<1x256x256x1xbf16> -// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7]], %[[VAL_8]]) : (tensor<1x256x256x1xbf16>, tensor<4xi64>) -> tensor<1x1x256x256xbf16> -// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<16x32x1x256xbf16>, tensor<1x1x256x256xbf16>) -> tensor<16x32x1x256xbf16> -// CHECK: %[[VAL_11:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<16x32x1x256xbf16>, tensor<1x1x256x256xbf16>) -> tensor<16x32x1x256xbf16> +// CHECK: %[[VAL_11:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_12:.*]] = "tf.Transpose"(%[[VAL_10]], %[[VAL_11]]) : (tensor<16x32x1x256xbf16>, tensor<4xi64>) -> tensor<16x32x256x1xbf16> // CHECK: %[[VAL_13:.*]] = arith.constant dense<[16, 32, 256]> : tensor<3xi64> // CHECK: %[[VAL_14:.*]] = "tf.Reshape"(%[[VAL_12]], %[[VAL_13]]) : (tensor<16x32x256x1xbf16>, 
tensor<3xi64>) -> tensor<16x32x256xbf16> @@ -2372,7 +2372,7 @@ func.func @convert_conv1d_missing_windows_strides_fallback(%arg0: tensor<16x32x2 // CHECK-LABEL: func.func @convert_conv1d_missing_windows_strides_fallback_2( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x64x64x4xbf16>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x4x320xbf16>) -> tensor<1x62x62x320xbf16> { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x64x64x4xbf16>, tensor<3x3x4x320xbf16>) -> tensor<1x62x62x320xbf16> +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<1x64x64x4xbf16>, tensor<3x3x4x320xbf16>) -> tensor<1x62x62x320xbf16> // CHECK: return %[[VAL_2]] : tensor<1x62x62x320xbf16> // CHECK: } func.func @convert_conv1d_missing_windows_strides_fallback_2(%arg0: tensor<1x64x64x4xbf16>, %arg1: tensor<3x3x4x320xbf16>) -> tensor<1x62x62x320xbf16> { @@ -2391,7 +2391,7 @@ func.func @convert_conv1d_missing_windows_strides_fallback_2(%arg0: tensor<1x64x // CHECK-LABEL: func @convert_conv2d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> // CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> // CHECK: } func.func @convert_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { @@ -2414,7 +2414,7 @@ func.func @convert_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16 // CHECK-LABEL: func @convert_group_conv2d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x14x14x2240xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x112x2240xf32>) -> tensor<1x7x7x2240xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [0, 0, 1, 1, 1, 1, 0, 0], padding = "EXPLICIT", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x14x14x2240xf32>, tensor<3x3x112x2240xf32>) -> tensor<1x7x7x2240xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [0, 0, 1, 1, 1, 1, 0, 0], padding = "EXPLICIT", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x14x14x2240xf32>, tensor<3x3x112x2240xf32>) -> tensor<1x7x7x2240xf32> // CHECk: return %[[VAL_2]] : tensor<1x7x7x2240xf32> // CHECK: } func.func @convert_group_conv2d(%arg0: tensor<1x14x14x2240xf32>, %arg1: tensor<3x3x112x2240xf32>) -> tensor<1x7x7x2240xf32> { @@ -2428,13 +2428,13 @@ func.func @convert_group_conv2d(%arg0: tensor<1x14x14x2240xf32>, %arg1: tensor<3 // CHECK-LABEL: func.func @convert_transpose_conv_with_transpose( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256x64x64xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x2x64x256xf32>) -> 
tensor<1x64x128x128xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0:.*]], %[[VAL_2:.*]]) : (tensor<1x256x64x64xf32>, tensor<4xi64>) -> tensor<1x64x64x256xf32> -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_5:.*]] = "tf.ReverseV2"(%[[VAL_1:.*]], %[[VAL_4:.*]]) : (tensor<2x2x64x256xf32>, tensor<2xi64>) -> tensor<2x2x64x256xf32> -// CHECK: %[[VAL_6:.*]] = "tf.Const"() {value = dense<[1, 128, 128, 64]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_6:.*]], %[[VAL_5:.*]], %[[VAL_3:.*]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<2x2x64x256xf32>, tensor<1x64x64x256xf32>) -> tensor<1x128x128x64xf32> -// CHECK: %[[VAL_8:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_6:.*]] = "tf.Const"() <{value = dense<[1, 128, 128, 64]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK: %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_6:.*]], %[[VAL_5:.*]], %[[VAL_3:.*]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> : (tensor<4xi32>, tensor<2x2x64x256xf32>, tensor<1x64x64x256xf32>) -> tensor<1x128x128x64xf32> +// CHECK: %[[VAL_8:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7:.*]], %[[VAL_8:.*]]) : (tensor<1x128x128x64xf32>, tensor<4xi64>) -> tensor<1x64x128x128xf32> // CHECK: return %[[VAL_9:.*]] : tensor<1x64x128x128xf32> // CHECK: } @@ -2450,13 +2450,13 @@ func.func @convert_transpose_conv_with_transpose(%arg0: tensor<1x256x64x64xf32>, // CHECK-LABEL: func.func @convert_transpose_conv_with_transpose2( // CHECK-SAME: %[[VAL_0:.*]]: tensor<64x64x1x256xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x2x64x256xf32>) -> tensor<128x128x1x64xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[2, 0, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[2, 0, 1, 3]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0:.*]], %[[VAL_2:.*]]) : (tensor<64x64x1x256xf32>, tensor<4xi64>) -> tensor<1x64x64x256xf32> -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_5:.*]] = "tf.ReverseV2"(%[[VAL_1:.*]], %[[VAL_4:.*]]) : (tensor<2x2x64x256xf32>, tensor<2xi64>) -> tensor<2x2x64x256xf32> -// CHECK: %[[VAL_6:.*]] = "tf.Const"() {value = dense<[1, 128, 128, 64]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_6:.*]], %[[VAL_5:.*]], %[[VAL_3:.*]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<2x2x64x256xf32>, tensor<1x64x64x256xf32>) -> tensor<1x128x128x64xf32> -// CHECK: %[[VAL_8:.*]] = 
"tf.Const"() {value = dense<[1, 2, 0, 3]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_6:.*]] = "tf.Const"() <{value = dense<[1, 128, 128, 64]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK: %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_6:.*]], %[[VAL_5:.*]], %[[VAL_3:.*]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> : (tensor<4xi32>, tensor<2x2x64x256xf32>, tensor<1x64x64x256xf32>) -> tensor<1x128x128x64xf32> +// CHECK: %[[VAL_8:.*]] = "tf.Const"() <{value = dense<[1, 2, 0, 3]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7:.*]], %[[VAL_8:.*]]) : (tensor<1x128x128x64xf32>, tensor<4xi64>) -> tensor<128x128x1x64xf32> // CHECK: return %[[VAL_9:.*]] : tensor<128x128x1x64xf32> // CHECK: } @@ -2473,7 +2473,7 @@ func.func @convert_transpose_conv_with_transpose2(%arg0: tensor<64x64x1x256xf32> // CHECK-LABEL: func @convert_conv2d_dynamic_batch( // CHECK-SAME: %[[VAL_0:.*]]: tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor, tensor<3x3x207x16xf32>) -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor, tensor<3x3x207x16xf32>) -> tensor // CHECK: return %[[VAL_2]] : tensor // CHECK: } func.func @convert_conv2d_dynamic_batch(%arg0: tensor, %arg1: tensor<3x3x207x16xf32>) -> tensor { @@ -2496,7 +2496,7 @@ func.func @convert_conv2d_dynamic_batch(%arg0: tensor, %arg1: ten // CHECK-LABEL: func @convert_conv2d_no_padding( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x6x6x207xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x4x4x16xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x6x6x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x4x4x16xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<1x6x6x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x4x4x16xf32> // CHECK: return %[[VAL_2]] : tensor<1x4x4x16xf32> // CHECK: } func.func @convert_conv2d_no_padding(%arg0: tensor<1x6x6x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x4x4x16xf32> { @@ -2519,7 +2519,7 @@ func.func @convert_conv2d_no_padding(%arg0: tensor<1x6x6x207xf32>, %arg1: tensor // CHECK-LABEL: func @convert_conv2d_no_rhs_dilation( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : 
(tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> // CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> // CHECK: } func.func @convert_conv2d_no_rhs_dilation(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { @@ -2542,7 +2542,7 @@ func.func @convert_conv2d_no_rhs_dilation(%arg0: tensor<1x8x8x207xf32>, %arg1: t // CHECK-LABEL: func @convert_conv2d_no_window_strides( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> // CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> // CHECK: } func.func @convert_conv2d_no_window_strides(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { @@ -2565,7 +2565,7 @@ func.func @convert_conv2d_no_window_strides(%arg0: tensor<1x8x8x207xf32>, %arg1: // CHECK-LABEL: func @convert_conv2d_no_lhs_dilation( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> // CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> // CHECK: } func.func @convert_conv2d_no_lhs_dilation(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { @@ -2588,12 +2588,12 @@ func.func @convert_conv2d_no_lhs_dilation(%arg0: tensor<1x8x8x207xf32>, %arg1: t // CHECK-LABEL: func @convert_conv2d_with_transpose( // CHECK-SAME: %[[VAL_0:.*]]: tensor<8x8x1x207xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[2, 0, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[2, 0, 1, 3]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_2]]) : (tensor<8x8x1x207xf32>, tensor<4xi64>) -> tensor<1x8x8x207xf32> -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_1]], %[[VAL_4]]) : (tensor<3x3x16x207xf32>, tensor<4xi64>) -> tensor<3x3x207x16xf32> -// CHECK: %[[VAL_6:.*]] = "tf.Conv2D"(%[[VAL_3]], %[[VAL_5]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : 
(tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> -// CHECK: %[[VAL_7:.*]] = "tf.Const"() {value = dense<[3, 1, 2, 0]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_6:.*]] = "tf.Conv2D"(%[[VAL_3]], %[[VAL_5]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: %[[VAL_7:.*]] = "tf.Const"() <{value = dense<[3, 1, 2, 0]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_8:.*]] = "tf.Transpose"(%[[VAL_6]], %[[VAL_7]]) : (tensor<1x8x8x16xf32>, tensor<4xi64>) -> tensor<16x8x8x1xf32> // CHECK: return %[[VAL_8]] : tensor<16x8x8x1xf32> // CHECK: } @@ -2617,12 +2617,12 @@ func.func @convert_conv2d_with_transpose(%arg0: tensor<8x8x1x207xf32>, %arg1: te // CHECK-LABEL: func @convert_conv2d_with_transpose_dynamic_batch( // CHECK-SAME: %[[VAL_0:.*]]: tensor<8x8x?x207xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x16x207xf32>) -> tensor<16x8x8x?xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[2, 0, 1, 3]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[2, 0, 1, 3]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_2]]) : (tensor<8x8x?x207xf32>, tensor<4xi64>) -> tensor -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_1]], %[[VAL_4]]) : (tensor<3x3x16x207xf32>, tensor<4xi64>) -> tensor<3x3x207x16xf32> -// CHECK: %[[VAL_6:.*]] = "tf.Conv2D"(%[[VAL_3]], %[[VAL_5]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor, tensor<3x3x207x16xf32>) -> tensor -// CHECK: %[[VAL_7:.*]] = "tf.Const"() {value = dense<[3, 1, 2, 0]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_6:.*]] = "tf.Conv2D"(%[[VAL_3]], %[[VAL_5]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor, tensor<3x3x207x16xf32>) -> tensor +// CHECK: %[[VAL_7:.*]] = "tf.Const"() <{value = dense<[3, 1, 2, 0]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_8:.*]] = "tf.Transpose"(%[[VAL_6]], %[[VAL_7]]) : (tensor, tensor<4xi64>) -> tensor<16x8x8x?xf32> // CHECK: return %[[VAL_8]] : tensor<16x8x8x?xf32> // CHECK: } @@ -2646,7 +2646,7 @@ func.func @convert_conv2d_with_transpose_dynamic_batch(%arg0: tensor<8x8x?x207xf // CHECK-LABEL: func @convert_conv2d_explicit_padding( // CHECK-SAME: %[[VAL_0:.*]]: tensor<64x8x8x8xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<8x8x8x64xf32>) -> tensor<64x3x3x64xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [0, 0, 1, 1, 1, 1, 0, 0], padding = "EXPLICIT", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<64x8x8x8xf32>, tensor<8x8x8x64xf32>) -> tensor<64x3x3x64xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [0, 0, 1, 1, 1, 1, 0, 0], padding = "EXPLICIT", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<64x8x8x8xf32>, tensor<8x8x8x64xf32>) -> tensor<64x3x3x64xf32> // CHECK: return 
%[[VAL_2]] : tensor<64x3x3x64xf32> // CHECK: } func.func @convert_conv2d_explicit_padding(%arg0: tensor<64x8x8x8xf32>, %arg1: tensor<8x8x8x64xf32>) -> tensor<64x3x3x64xf32> { @@ -2670,7 +2670,7 @@ func.func @convert_conv2d_explicit_padding(%arg0: tensor<64x8x8x8xf32>, %arg1: t // CHECK-LABEL: func @convert_conv2d_explicit_padding_dynamic_batch( // CHECK-SAME: %[[VAL_0:.*]]: tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor<8x8x8x64xf32>) -> tensor { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [0, 0, 1, 1, 1, 1, 0, 0], padding = "EXPLICIT", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor, tensor<8x8x8x64xf32>) -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [0, 0, 1, 1, 1, 1, 0, 0], padding = "EXPLICIT", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor, tensor<8x8x8x64xf32>) -> tensor // CHECK: return %[[VAL_2]] : tensor // CHECK: } func.func @convert_conv2d_explicit_padding_dynamic_batch(%arg0: tensor, %arg1: tensor<8x8x8x64xf32>) -> tensor { @@ -2694,8 +2694,8 @@ func.func @convert_conv2d_explicit_padding_dynamic_batch(%arg0: tensor, // CHECK-SAME: %[[ARG1:.*]]: tensor<3x2x64x4xf32>) -> tensor<128x4x3x4xf32> { -// CHECK-DAG: %[[START:.*]] = "tf.Const"() {value = dense<[0, 0, 5, 0]> : tensor<4xi64>} : () -> tensor<4xi64> -// CHECK-DAG: %[[SIZE:.*]] = "tf.Const"() {value = dense<[128, 5, 4, 64]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[START:.*]] = "tf.Const"() <{value = dense<[0, 0, 5, 0]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK-DAG: %[[SIZE:.*]] = "tf.Const"() <{value = dense<[128, 5, 4, 64]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[SLICED_ARG0:.*]] = "tf.Slice"(%[[ARG0]], %[[START]], %[[SIZE]]) // CHECK-SAME: (tensor<128x7x9x64xf32>, tensor<4xi64>, tensor<4xi64>) -> tensor<128x5x4x64xf32> // CHECK: %[[CONV:.*]] = "tf.Conv2D"(%[[SLICED_ARG0]], %[[ARG1]]) @@ -2724,8 +2724,8 @@ func.func @convert_conv2d_negative_explicit_padding(%arg0: tensor<128x7x9x64xf32 // CHECK-LABEL: func @convert_conv2d_negative_explicit_padding_dynamic_batch( // CHECK-SAME: %[[ARG0:.*]]: tensor, // CHECK-SAME: %[[ARG1:.*]]: tensor<3x2x64x4xf32>) -> tensor { -// CHECK-DAG: %[[START:.*]] = "tf.Const"() {value = dense<[0, 0, 5, 0]> : tensor<4xi64>} : () -> tensor<4xi64> -// CHECK-DAG: %[[SIZE:.*]] = "tf.Const"() {value = dense<[-9223372036854775808, 5, 4, 64]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[START:.*]] = "tf.Const"() <{value = dense<[0, 0, 5, 0]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK-DAG: %[[SIZE:.*]] = "tf.Const"() <{value = dense<[-9223372036854775808, 5, 4, 64]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[SLICED_ARG0:.*]] = "tf.Slice"(%[[ARG0]], %[[START]], %[[SIZE]]) // CHECK-SAME: (tensor, tensor<4xi64>, tensor<4xi64>) -> tensor // CHECK: %[[CONV:.*]] = "tf.Conv2D"(%[[SLICED_ARG0]], %[[ARG1]]) @@ -2756,7 +2756,7 @@ func.func @convert_conv2d_negative_explicit_padding_dynamic_batch(%arg0: tensor< // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x1x3312xf32>) -> tensor<1x8x8x3312xf32> { // CHECK: %[[CST:.*]] = arith.constant dense<[3, 3, 207, 16]> : tensor<4xi64> // CHECK: %[[VAL_2:.*]] = "tf.Reshape"(%[[VAL_1]], %[[CST]]) : (tensor<3x3x1x3312xf32>, tensor<4xi64>) -> tensor<3x3x207x16xf32> -// CHECK: %[[VAL_3:.*]] = "tf.DepthwiseConv2dNative"(%[[VAL_0]], %[[VAL_2]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = 
"SAME", strides = [1, 1, 1, 1]} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x3312xf32> +// CHECK: %[[VAL_3:.*]] = "tf.DepthwiseConv2dNative"(%[[VAL_0]], %[[VAL_2]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x3312xf32> // CHECK: return %[[VAL_3]] : tensor<1x8x8x3312xf32> // CHECK: } func.func @convert_depthwise_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x1x3312xf32>) -> tensor<1x8x8x3312xf32> { @@ -2779,8 +2779,8 @@ func.func @convert_depthwise_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor< // CHECK-LABEL: func @convert_conv2d_to_resize( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x56x624x16xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x257x16x1xf32>) -> tensor<1x56x904x16xf32> { -// CHECK-DAG: %[[SIZE:.*]] = "tf.Const"() {value = dense<[56, 904]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK: %[[VAL_2:.*]] = "tf.ResizeBilinear"(%[[VAL_0]], %[[SIZE]]) {align_corners = true, half_pixel_centers = false} : (tensor<1x56x624x16xf32>, tensor<2xi32>) -> tensor<1x56x904x16xf32> +// CHECK-DAG: %[[SIZE:.*]] = "tf.Const"() <{value = dense<[56, 904]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK: %[[VAL_2:.*]] = "tf.ResizeBilinear"(%[[VAL_0]], %[[SIZE]]) <{align_corners = true, half_pixel_centers = false}> : (tensor<1x56x624x16xf32>, tensor<2xi32>) -> tensor<1x56x904x16xf32> // CHECK: return %[[VAL_2]] : tensor<1x56x904x16xf32> // CHECK: } func.func @convert_conv2d_to_resize(%arg0: tensor<1x56x624x16xf32>, %arg1: tensor<1x257x16x1xf32>) -> tensor<1x56x904x16xf32> { @@ -2798,8 +2798,8 @@ func.func @convert_conv2d_to_resize(%arg0: tensor<1x56x624x16xf32>, %arg1: tenso // CHECK-LABEL: func @convert_conv2d_resize_perferred( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x56x1248x16xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x1x16x1xf32>) -> tensor<1x111x1248x16xf32> { -// CHECK-DAG: %[[SIZE:.*]] = "tf.Const"() {value = dense<[111, 1248]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK: %[[VAL_2:.*]] = "tf.ResizeBilinear"(%[[VAL_0]], %[[SIZE]]) {align_corners = true, half_pixel_centers = false} : (tensor<1x56x1248x16xf32>, tensor<2xi32>) -> tensor<1x111x1248x16xf32> +// CHECK-DAG: %[[SIZE:.*]] = "tf.Const"() <{value = dense<[111, 1248]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK: %[[VAL_2:.*]] = "tf.ResizeBilinear"(%[[VAL_0]], %[[SIZE]]) <{align_corners = true, half_pixel_centers = false}> : (tensor<1x56x1248x16xf32>, tensor<2xi32>) -> tensor<1x111x1248x16xf32> // CHECK: return %[[VAL_2]] : tensor<1x111x1248x16xf32> // CHECK: } func.func @convert_conv2d_resize_perferred(%arg0: tensor<1x56x1248x16xf32>, %arg1: tensor<3x1x16x1xf32>) -> tensor<1x111x1248x16xf32> { @@ -2817,10 +2817,10 @@ func.func @convert_conv2d_resize_perferred(%arg0: tensor<1x56x1248x16xf32>, %arg // CHECK-LABEL: func @convert_conv2d_back_prop_input_same_pad( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256x256x2xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4x4x2x2xf32>) -> tensor<1x512x512x2xf32> { -// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_4:.*]] = "tf.ReverseV2"(%[[VAL_1]], %[[VAL_3]]) : (tensor<4x4x2x2xf32>, tensor<2xi64>) -> tensor<4x4x2x2xf32> -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 512, 512, 2]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: %[[VAL_5:.*]] = 
"tf.Conv2DBackpropInput"(%[[VAL_2]], %[[VAL_4]], %[[VAL_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<4x4x2x2xf32>, tensor<1x256x256x2xf32>) -> tensor<1x512x512x2xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[1, 512, 512, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK: %[[VAL_5:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_2]], %[[VAL_4]], %[[VAL_0]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> : (tensor<4xi32>, tensor<4x4x2x2xf32>, tensor<1x256x256x2xf32>) -> tensor<1x512x512x2xf32> // CHECK: return %[[VAL_5]] : tensor<1x512x512x2xf32> // CHECK: } func.func @convert_conv2d_back_prop_input_same_pad(%arg0: tensor<1x256x256x2xf32>, %arg1: tensor<4x4x2x2xf32>) -> tensor<1x512x512x2xf32> { @@ -2845,10 +2845,10 @@ func.func @convert_conv2d_back_prop_input_negative_pad(%arg0: tensor<1x256x256x2 // CHECK-LABEL: func @convert_conv2d_back_prop_input( // CHECK-SAME: %[[VAL_0:.*]]: tensor<8x4x4x32xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x64x32xf32>) -> tensor<8x8x8x64xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_3:.*]] = "tf.ReverseV2"(%[[VAL_1]], %[[VAL_2]]) : (tensor<3x3x64x32xf32>, tensor<2xi64>) -> tensor<3x3x64x32xf32> -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[8, 8, 8, 64]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: %[[VAL_5:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_4]], %[[VAL_3]], %[[VAL_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<3x3x64x32xf32>, tensor<8x4x4x32xf32>) -> tensor<8x8x8x64xf32> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[8, 8, 8, 64]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK: %[[VAL_5:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_4]], %[[VAL_3]], %[[VAL_0]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> : (tensor<4xi32>, tensor<3x3x64x32xf32>, tensor<8x4x4x32xf32>) -> tensor<8x8x8x64xf32> // CHECK: return %[[VAL_5]] : tensor<8x8x8x64xf32> // CHECK: } func.func @convert_conv2d_back_prop_input(%arg0: tensor<8x4x4x32xf32>, %arg1: tensor<3x3x64x32xf32>) -> tensor<8x8x8x64xf32> { @@ -2872,12 +2872,12 @@ func.func @convert_conv2d_back_prop_input(%arg0: tensor<8x4x4x32xf32>, %arg1: te // CHECK-LABEL: func @convert_conv2d_back_prop_input_transpose_filter( // CHECK-SAME: %[[VAL_0:.*]]: tensor<8x4x4x32xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x32x64xf32>) -> tensor<8x8x8x64xf32> { -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK-DAG: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_1]], %[[VAL_3]]) : (tensor<3x3x32x64xf32>, tensor<4xi64>) -> tensor<3x3x64x32xf32> // CHECK: %[[VAL_5:.*]] = "tf.ReverseV2"(%[[VAL_4]], %[[VAL_2]]) : 
(tensor<3x3x64x32xf32>, tensor<2xi64>) -> tensor<3x3x64x32xf32> -// CHECK: %[[VAL_6:.*]] = "tf.Const"() {value = dense<[8, 8, 8, 64]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_6]], %[[VAL_5]], %[[VAL_0]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<4xi32>, tensor<3x3x64x32xf32>, tensor<8x4x4x32xf32>) -> tensor<8x8x8x64xf32> +// CHECK: %[[VAL_6:.*]] = "tf.Const"() <{value = dense<[8, 8, 8, 64]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK: %[[VAL_7:.*]] = "tf.Conv2DBackpropInput"(%[[VAL_6]], %[[VAL_5]], %[[VAL_0]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> : (tensor<4xi32>, tensor<3x3x64x32xf32>, tensor<8x4x4x32xf32>) -> tensor<8x8x8x64xf32> // CHECK: return %[[VAL_7]] : tensor<8x8x8x64xf32> // CHECK: } func.func @convert_conv2d_back_prop_input_transpose_filter(%arg0: tensor<8x4x4x32xf32>, %arg1: tensor<3x3x32x64xf32>) -> tensor<8x8x8x64xf32> { @@ -2901,7 +2901,7 @@ func.func @convert_conv2d_back_prop_input_transpose_filter(%arg0: tensor<8x4x4x3 // CHECK-LABEL: func @convert_conv2d_valid_padding( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x6x6x16xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x6x6x16xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x6x6x16xf32> // CHECK: return %[[VAL_2]] : tensor<1x6x6x16xf32> // CHECK: } func.func @convert_conv2d_valid_padding(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x6x6x16xf32> { @@ -2924,7 +2924,7 @@ func.func @convert_conv2d_valid_padding(%arg0: tensor<1x8x8x207xf32>, %arg1: ten // CHECK-LABEL: func @convert_conv2d_valid_padding_dynamic_batch( // CHECK-SAME: %[[VAL_0:.*]]: tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor { -// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor, tensor<3x3x207x16xf32>) -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor, tensor<3x3x207x16xf32>) -> tensor // CHECK: return %[[VAL_2]] : tensor // CHECK: } func.func @convert_conv2d_valid_padding_dynamic_batch(%arg0: tensor, %arg1: tensor<3x3x207x16xf32>) -> tensor { @@ -2946,9 +2946,9 @@ func.func @convert_conv2d_valid_padding_dynamic_batch(%arg0: tensor) -> tensor<1xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_3:.*]] = "tf.Prod"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK-DAG: 
%[[VAL_1:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Prod"(%[[VAL_0]], %[[VAL_2]]) <{keep_dims = false}> : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: return %[[VAL_3]] : tensor<1xf32> // CHECK: } func.func @convert_reduce_to_prod(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { @@ -2963,9 +2963,9 @@ func.func @convert_reduce_to_prod(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { // CHECK-LABEL: func @convert_reduce_to_sum( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>) -> tensor<1xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_3:.*]] = "tf.Sum"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Sum"(%[[VAL_0]], %[[VAL_2]]) <{keep_dims = false}> : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: return %[[VAL_3]] : tensor<1xf32> // CHECK: } func.func @convert_reduce_to_sum(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { @@ -2981,8 +2981,8 @@ func.func @convert_reduce_to_sum(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { // CHECK-LABEL: func @convert_reduce_to_prod_non_constant_init( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x256xf32>, // CHECK-SAME: %[[ARG_1:.*]]: tensor) -> tensor<1xf32> { -// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_1:.*]] = "tf.Prod"(%[[ARG_0]], %[[VAL_0]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_1:.*]] = "tf.Prod"(%[[ARG_0]], %[[VAL_0]]) <{keep_dims = false}> : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: %[[VAL_2:.*]] = "tf.Mul"(%[[VAL_1]], %[[ARG_1]]) : (tensor<1xf32>, tensor) -> tensor<1xf32> // CHECK: return %[[VAL_2]] : tensor<1xf32> // CHECK: } @@ -2999,8 +2999,8 @@ func.func @convert_reduce_to_prod_non_constant_init(%arg0: tensor<1x256xf32>, %a // CHECK-LABEL: func @convert_reduce_to_sum_non_constant_init( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x256xf32>, // CHECK-SAME: %[[ARG_1:.*]]: tensor) -> tensor<1xf32> { -// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_1:.*]] = "tf.Sum"(%[[ARG_0]], %[[VAL_0]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_1:.*]] = "tf.Sum"(%[[ARG_0]], %[[VAL_0]]) <{keep_dims = false}> : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: %[[VAL_2:.*]] = "tf.Add"(%[[VAL_1]], %[[ARG_1]]) : (tensor<1xf32>, tensor) -> tensor<1xf32> // CHECK: return %[[VAL_2]] : tensor<1xf32> // CHECK: } @@ -3015,9 +3015,9 @@ func.func @convert_reduce_to_sum_non_constant_init(%arg0: tensor<1x256xf32>, %ar // CHECK-LABEL: func @convert_int_reduce_to_prod( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xi32>) -> tensor<1xi32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() 
{value = dense<1> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_3:.*]] = "tf.Prod"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xi32>, tensor<1xi64>) -> tensor<1xi32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Prod"(%[[VAL_0]], %[[VAL_2]]) <{keep_dims = false}> : (tensor<1x256xi32>, tensor<1xi64>) -> tensor<1xi32> // CHECK: return %[[VAL_3]] : tensor<1xi32> // CHECK: } func.func @convert_int_reduce_to_prod(%arg0: tensor<1x256xi32>) -> tensor<1xi32> { @@ -3033,9 +3033,9 @@ func.func @convert_int_reduce_to_prod(%arg0: tensor<1x256xi32>) -> tensor<1xi32> // CHECK-LABEL: func @convert_int_reduce_to_sum( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xi32>) -> tensor<1xi32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_3:.*]] = "tf.Sum"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xi32>, tensor<1xi64>) -> tensor<1xi32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Sum"(%[[VAL_0]], %[[VAL_2]]) <{keep_dims = false}> : (tensor<1x256xi32>, tensor<1xi64>) -> tensor<1xi32> // CHECK: return %[[VAL_3]] : tensor<1xi32> // CHECK: } func.func @convert_int_reduce_to_sum(%arg0: tensor<1x256xi32>) -> tensor<1xi32> { @@ -3050,8 +3050,8 @@ func.func @convert_int_reduce_to_sum(%arg0: tensor<1x256xi32>) -> tensor<1xi32> // CHECK-LABEL: func @convert_reduce_to_max( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>) -> tensor<1xf32> { -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_3:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_2]]) <{keep_dims = false}> : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: return %[[VAL_3]] : tensor<1xf32> // CHECK: } func.func @convert_reduce_to_max(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { @@ -3067,8 +3067,8 @@ func.func @convert_reduce_to_max(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { // CHECK-LABEL: func @convert_reduce_to_max_int( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4xi32>) -> tensor<1xi32> { -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_3:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x4xi32>, tensor<1xi64>) -> tensor<1xi32> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_2]]) <{keep_dims = false}> : (tensor<1x4xi32>, tensor<1xi64>) -> tensor<1xi32> // CHECK: return %[[VAL_3]] : tensor<1xi32> func.func @convert_reduce_to_max_int(%arg0: tensor<1x4xi32>) -> tensor<1xi32> { // -2147483648 is MIN for INT32 @@ -3083,8 +3083,8 @@ func.func @convert_reduce_to_max_int(%arg0: tensor<1x4xi32>) -> tensor<1xi32> { // CHECK-LABEL: func 
@convert_reduce_to_min( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>) -> tensor<1xf32> { -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_3:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_2]]) <{keep_dims = false}> : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: return %[[VAL_3]] : tensor<1xf32> // CHECK: } func.func @convert_reduce_to_min(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { @@ -3100,8 +3100,8 @@ func.func @convert_reduce_to_min(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { // CHECK-LABEL: func @convert_reduce_to_min_int( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4xi32>) -> tensor<1xi32> { -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_3:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x4xi32>, tensor<1xi64>) -> tensor<1xi32> +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_2]]) <{keep_dims = false}> : (tensor<1x4xi32>, tensor<1xi64>) -> tensor<1xi32> // CHECK: return %[[VAL_3]] : tensor<1xi32> func.func @convert_reduce_to_min_int(%arg0: tensor<1x4xi32>) -> tensor<1xi32> { // 2147483647 is MAX for INT32 @@ -3115,9 +3115,9 @@ func.func @convert_reduce_to_min_int(%arg0: tensor<1x4xi32>) -> tensor<1xi32> { } // CHECK-LABEL: func @convert_iota_1d() -> tensor<123xf32> { -// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1.230000e+02> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor +// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<1.230000e+02> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK: %[[VAL_3:.*]] = "tf.Range"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor, tensor, tensor) -> tensor<123xf32> // CHECK: return %[[VAL_3]] : tensor<123xf32> // CHECK: } @@ -3127,13 +3127,13 @@ func.func @convert_iota_1d() -> tensor<123xf32> { } // CHECK-LABEL: func @convert_iota_3d() -> tensor<5x7x9xi32> { -// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<7> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<7> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[VAL_3:.*]] = "tf.Range"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor, tensor, tensor) -> tensor<7xi32> -// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[1, 7, 1]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<[1, 7, 1]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK: %[[VAL_5:.*]] = "tf.Reshape"(%[[VAL_3]], %[[VAL_4]]) : (tensor<7xi32>, 
tensor<3xi64>) -> tensor<1x7x1xi32> -// CHECK: %[[VAL_6:.*]] = "tf.Const"() {value = dense<[5, 7, 9]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_6:.*]] = "tf.Const"() <{value = dense<[5, 7, 9]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK: %[[VAL_7:.*]] = "tf.BroadcastTo"(%[[VAL_5]], %[[VAL_6]]) : (tensor<1x7x1xi32>, tensor<3xi64>) -> tensor<5x7x9xi32> // CHECK: return %[[VAL_7]] : tensor<5x7x9xi32> // CHECK: } @@ -3144,7 +3144,7 @@ func.func @convert_iota_3d() -> tensor<5x7x9xi32> { // CHECK-LABEL: func @convert_avgpool_valid( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) {data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]} : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) <{data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x7x7x8xf32> // CHECK: } func.func @convert_avgpool_valid(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { @@ -3166,7 +3166,7 @@ func.func @convert_avgpool_valid(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8 // CHECK-LABEL: func @convert_avgpool_valid_broadcasted_divisor( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) {data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]} : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) <{data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x7x7x8xf32> // CHECK: } func.func @convert_avgpool_valid_broadcasted_divisor(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { @@ -3189,7 +3189,7 @@ func.func @convert_avgpool_valid_broadcasted_divisor(%arg0: tensor<4x16x16x8xf32 // CHECK-LABEL: func @convert_avgpool_valid_channel_first( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x3x16x16xf32>) -> tensor<4x3x7x7xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) {data_format = "NCHW", ksize = [1, 1, 3, 3], padding = "VALID", strides = [1, 1, 2, 2]} : (tensor<4x3x16x16xf32>) -> tensor<4x3x7x7xf32> +// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) <{data_format = "NCHW", ksize = [1, 1, 3, 3], padding = "VALID", strides = [1, 1, 2, 2]}> : (tensor<4x3x16x16xf32>) -> tensor<4x3x7x7xf32> // CHECK: return %[[VAL_1]] : tensor<4x3x7x7xf32> // CHECK: } func.func @convert_avgpool_valid_channel_first(%arg0: tensor<4x3x16x16xf32>) -> tensor<4x3x7x7xf32> { @@ -3211,7 +3211,7 @@ func.func @convert_avgpool_valid_channel_first(%arg0: tensor<4x3x16x16xf32>) -> // CHECK-LABEL: func @convert_avgpool_valid_rw( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) {data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]} : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) <{data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x7x7x8xf32> // CHECK: } func.func @convert_avgpool_valid_rw(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { @@ -3243,7 +3243,7 @@ func.func 
@convert_avgpool_valid_rw(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x // CHECK-LABEL: func @convert_avgpool_valid_rw_broadcasted_const_lhs( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) {data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]} : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) <{data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x7x7x8xf32> // CHECK: } func.func @convert_avgpool_valid_rw_broadcasted_const_lhs(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { @@ -3276,7 +3276,7 @@ func.func @convert_avgpool_valid_rw_broadcasted_const_lhs(%arg0: tensor<4x16x16x // CHECK-LABEL: func @convert_avgpool_valid_3d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x16x8xf32>) -> tensor<4x7x7x7x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.AvgPool3D"(%[[VAL_0]]) {data_format = "NDHWC", ksize = [1, 3, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 2, 1]} : (tensor<4x16x16x16x8xf32>) -> tensor<4x7x7x7x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.AvgPool3D"(%[[VAL_0]]) <{data_format = "NDHWC", ksize = [1, 3, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 2, 1]}> : (tensor<4x16x16x16x8xf32>) -> tensor<4x7x7x7x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x7x7x7x8xf32> // CHECK: } func.func @convert_avgpool_valid_3d(%arg0: tensor<4x16x16x16x8xf32>) -> tensor<4x7x7x7x8xf32> { @@ -3298,7 +3298,7 @@ func.func @convert_avgpool_valid_3d(%arg0: tensor<4x16x16x16x8xf32>) -> tensor<4 // CHECK-LABEL: func @convert_avgpool_valid_3d_channel_first( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x3x16x16x16xf32>) -> tensor<4x3x7x7x7xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.AvgPool3D"(%[[VAL_0]]) {data_format = "NCDHW", ksize = [1, 1, 3, 3, 3], padding = "VALID", strides = [1, 1, 2, 2, 2]} : (tensor<4x3x16x16x16xf32>) -> tensor<4x3x7x7x7xf32> +// CHECK: %[[VAL_1:.*]] = "tf.AvgPool3D"(%[[VAL_0]]) <{data_format = "NCDHW", ksize = [1, 1, 3, 3, 3], padding = "VALID", strides = [1, 1, 2, 2, 2]}> : (tensor<4x3x16x16x16xf32>) -> tensor<4x3x7x7x7xf32> // CHECK: return %[[VAL_1]] : tensor<4x3x7x7x7xf32> // CHECK: } func.func @convert_avgpool_valid_3d_channel_first(%arg0: tensor<4x3x16x16x16xf32>) -> tensor<4x3x7x7x7xf32> { @@ -3320,7 +3320,7 @@ func.func @convert_avgpool_valid_3d_channel_first(%arg0: tensor<4x3x16x16x16xf32 // CHECK-LABEL: func @convert_avgpool_same( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) {data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) <{data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "SAME", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x8x8x8xf32> // CHECK: } func.func @convert_avgpool_same(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> { @@ -3352,7 +3352,7 @@ func.func @convert_avgpool_same(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x8x8x8x // CHECK-LABEL: func @convert_avgpool_reshape_broadcast( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) {data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "SAME", strides = [1, 2, 2, 1]} : 
(tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) <{data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "SAME", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x8x8x8xf32> // CHECK: } func.func @convert_avgpool_reshape_broadcast(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> { @@ -3376,7 +3376,7 @@ func.func @convert_avgpool_reshape_broadcast(%arg0: tensor<4x16x16x8xf32>) -> te // CHECK-LABEL: func @convert_maxpool_valid( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.MaxPool"(%[[VAL_0]]) {data_format = "NHWC", explicit_paddings = [], ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]} : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.MaxPool"(%[[VAL_0]]) <{data_format = "NHWC", explicit_paddings = [], ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x7x7x8xf32> // CHECK: } func.func @convert_maxpool_valid(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { @@ -3397,7 +3397,7 @@ func.func @convert_maxpool_valid(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8 // CHECK-LABEL: func @convert_maxpool_valid_channel_first( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x3x16x16xf32>) -> tensor<4x3x7x7xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.MaxPool"(%[[VAL_0]]) {data_format = "NCHW", explicit_paddings = [], ksize = [1, 1, 3, 3], padding = "VALID", strides = [1, 1, 2, 2]} : (tensor<4x3x16x16xf32>) -> tensor<4x3x7x7xf32> +// CHECK: %[[VAL_1:.*]] = "tf.MaxPool"(%[[VAL_0]]) <{data_format = "NCHW", explicit_paddings = [], ksize = [1, 1, 3, 3], padding = "VALID", strides = [1, 1, 2, 2]}> : (tensor<4x3x16x16xf32>) -> tensor<4x3x7x7xf32> // CHECK: return %[[VAL_1]] : tensor<4x3x7x7xf32> // CHECK: } func.func @convert_maxpool_valid_channel_first(%arg0: tensor<4x3x16x16xf32>) -> tensor<4x3x7x7xf32> { @@ -3418,7 +3418,7 @@ func.func @convert_maxpool_valid_channel_first(%arg0: tensor<4x3x16x16xf32>) -> // CHECK-LABEL: func @convert_maxpool_valid_3d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x16x8xf32>) -> tensor<4x7x7x7x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.MaxPool3D"(%[[VAL_0]]) {data_format = "NDHWC", ksize = [1, 3, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 2, 1]} : (tensor<4x16x16x16x8xf32>) -> tensor<4x7x7x7x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.MaxPool3D"(%[[VAL_0]]) <{data_format = "NDHWC", ksize = [1, 3, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 2, 1]}> : (tensor<4x16x16x16x8xf32>) -> tensor<4x7x7x7x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x7x7x7x8xf32> // CHECK: } func.func @convert_maxpool_valid_3d(%arg0: tensor<4x16x16x16x8xf32>) -> tensor<4x7x7x7x8xf32> { @@ -3439,7 +3439,7 @@ func.func @convert_maxpool_valid_3d(%arg0: tensor<4x16x16x16x8xf32>) -> tensor<4 // CHECK-LABEL: func @convert_maxpool_valid_3d_channel_first( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x3x16x16x16xf32>) -> tensor<4x3x7x7x7xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.MaxPool3D"(%[[VAL_0]]) {data_format = "NCDHW", ksize = [1, 1, 3, 3, 3], padding = "VALID", strides = [1, 1, 2, 2, 2]} : (tensor<4x3x16x16x16xf32>) -> tensor<4x3x7x7x7xf32> +// CHECK: %[[VAL_1:.*]] = "tf.MaxPool3D"(%[[VAL_0]]) <{data_format = "NCDHW", ksize = [1, 1, 3, 3, 3], padding = "VALID", strides = [1, 1, 2, 2, 2]}> : (tensor<4x3x16x16x16xf32>) -> tensor<4x3x7x7x7xf32> // CHECK: return %[[VAL_1]] : tensor<4x3x7x7x7xf32> // CHECK: 
} func.func @convert_maxpool_valid_3d_channel_first(%arg0: tensor<4x3x16x16x16xf32>) -> tensor<4x3x7x7x7xf32> { @@ -3460,7 +3460,7 @@ func.func @convert_maxpool_valid_3d_channel_first(%arg0: tensor<4x3x16x16x16xf32 // CHECK-LABEL: func @convert_maxpool_same( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> { -// CHECK: %[[VAL_1:.*]] = "tf.MaxPool"(%[[VAL_0]]) {data_format = "NHWC", explicit_paddings = [], ksize = [1, 3, 3, 1], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> +// CHECK: %[[VAL_1:.*]] = "tf.MaxPool"(%[[VAL_0]]) <{data_format = "NHWC", explicit_paddings = [], ksize = [1, 3, 3, 1], padding = "SAME", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> // CHECK: return %[[VAL_1]] : tensor<4x8x8x8xf32> // CHECK: } func.func @convert_maxpool_same(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x8x8x8xf32> { @@ -3574,8 +3574,8 @@ func.func @convert_floor_mod_int(%arg0: tensor<192x8xi32>, %arg1: tensor<192x8xi } // CHECK-LABEL: func @convert_floor_mod_float_cst -// CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<192x8xbf16>} : () -> tensor<192x8xbf16> -// CHECK-DAG: %[[CST2:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<192x8xbf16>} : () -> tensor<192x8xbf16> +// CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<192x8xbf16>}> : () -> tensor<192x8xbf16> +// CHECK-DAG: %[[CST2:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<192x8xbf16>}> : () -> tensor<192x8xbf16> // CHECK: %[[RESULT:.*]] = "tf.FloorMod"(%arg0, %[[CST2]]) : (tensor<192x8xbf16>, tensor<192x8xbf16>) -> tensor<192x8xbf16> // CHECK: return %[[RESULT]] : tensor<192x8xbf16> // CHECK: } @@ -3592,8 +3592,8 @@ func.func @convert_floor_mod_float_cst(%arg0: tensor<192x8xbf16>) -> tensor<192x } // CHECK-LABEL: func @convert_floor_mod_int_cst -// CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<2> : tensor<192x8xi32>} : () -> tensor<192x8xi32> -// CHECK-DAG: %[[CST2:.*]] = "tf.Const"() {value = dense<2> : tensor<192x8xi32>} : () -> tensor<192x8xi32> +// CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<2> : tensor<192x8xi32>}> : () -> tensor<192x8xi32> +// CHECK-DAG: %[[CST2:.*]] = "tf.Const"() <{value = dense<2> : tensor<192x8xi32>}> : () -> tensor<192x8xi32> // CHECK: %[[RESULT:.*]] = "tf.FloorMod"(%arg0, %[[CST2]]) : (tensor<192x8xi32>, tensor<192x8xi32>) -> tensor<192x8xi32> // CHECK: return %[[RESULT]] : tensor<192x8xi32> // CHECK: } @@ -3649,7 +3649,7 @@ func.func @convert_floor_div(%arg0: tensor<10x10xbf16>, %arg1: tensor<10x10xbf16 } // CHECK-LABEL: func @convert_floor_div_cst -// CHECK: %[[CST2:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<10x10xbf16>} : () -> tensor<10x10xbf16> +// CHECK: %[[CST2:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<10x10xbf16>}> : () -> tensor<10x10xbf16> // CHECK: %[[RESULT:.*]] = "tf.FloorDiv"(%arg0, %[[CST2]]) : (tensor<10x10xbf16>, tensor<10x10xbf16>) -> tensor<10x10xbf16> // CHECK: return %[[RESULT]] // CHECK: } @@ -3674,7 +3674,7 @@ func.func @convert_floor_div_cst(%arg0: tensor<10x10xbf16>) -> tensor<10x10xbf16 } // CHECK-LABEL: func @convert_floor_div_cst2 -// CHECK: %[[CST2:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<10x10xbf16>} : () -> tensor<10x10xbf16> +// CHECK: %[[CST2:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<10x10xbf16>}> : () -> tensor<10x10xbf16> // CHECK: %[[RESULT:.*]] = "tf.FloorDiv"(%arg0, %[[CST2]]) : (tensor<10x10xbf16>, tensor<10x10xbf16>) -> 
tensor<10x10xbf16> // CHECK: return %[[RESULT]] // CHECK: } @@ -3791,10 +3791,10 @@ func.func @convert_gather_transpose(%arg0: tensor<128x256xf32>, %arg1: tensor<4x // CHECK-LABEL: func @convert_gather_offset( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x20xi32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x1xi32>) -> tensor<1x1xi32> { -// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_2]]) : (tensor<1x20xi32>, tensor<2xi64>) -> tensor<20x1xi32> // CHECK: %[[VAL_4:.*]] = "tf.GatherNd"(%[[VAL_3]], %[[VAL_1]]) : (tensor<20x1xi32>, tensor<1x1xi32>) -> tensor<1x1xi32> -// CHECK: %[[VAL_5:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_6:.*]] = "tf.Transpose"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x1xi32>, tensor<2xi64>) -> tensor<1x1xi32> // CHECK: return %[[VAL_6]] : tensor<1x1xi32> // CHECK: } @@ -3815,12 +3815,12 @@ func.func @convert_gather_offset(%arg0: tensor<1x20xi32>, %arg1: tensor<1x1xi32> // CHECK-LABEL: func @convert_gather_to_slice_batch_size_1( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x2944xi32>, // CHECK-SAME: %[[ARG_1:.*]]: tensor<1x2xi32>) -// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 1440]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 1440]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[VAL_0:.*]] = "tf.Maximum"(%[[ARG_1]], %[[CST_0:.*]]) : (tensor<1x2xi32>, tensor<2xi32>) -> tensor<1x2xi32> // CHECK: %[[VAL_1:.*]] = "tf.Minimum"(%[[VAL_0]], %[[CST]]) : (tensor<1x2xi32>, tensor<2xi32>) -> tensor<1x2xi32> -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {value = dense<[1, 1504]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK: %[[VAL_2:.*]] = "tf.Squeeze"(%[[VAL_1]]) {squeeze_dims = [0]} : (tensor<1x2xi32>) -> tensor<2xi32> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{value = dense<[1, 1504]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK: %[[VAL_2:.*]] = "tf.Squeeze"(%[[VAL_1]]) <{squeeze_dims = [0]}> : (tensor<1x2xi32>) -> tensor<2xi32> // CHECK: %[[VAL_3:.*]] = "tf.Slice"(%[[ARG_0]], %[[VAL_2]], %[[CST_1]]) : (tensor<1x2944xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x1504xi32> // CHECK: return %[[VAL_3]] // CHECK: } @@ -3878,27 +3878,27 @@ func.func @convert_gather_scalar_dynamic_indices(%arg0: tensor<256000xf32>, %arg // CHECK-LABEL: func @convert_gather_to_slice( // CHECK-SAME: %[[ARG_0:.*]]: tensor<3x2944xi32>, // CHECK-SAME: %[[ARG_1:.*]]: tensor<3x2xi32>) -// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[2, 1440]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[2, 1440]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[VAL_0:.*]] = "tf.Maximum"(%[[ARG_1]], %[[CST_0]]) : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<3x2xi32> // CHECK: %[[VAL_1:.*]] = "tf.Minimum"(%[[VAL_0]], %[[CST]]) : 
(tensor<3x2xi32>, tensor<2xi32>) -> tensor<3x2xi32> -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {value = dense<[1, 1504]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CST_3:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{value = dense<[1, 1504]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_3:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[VAL_2:.*]] = "tf.Slice"(%[[VAL_1]], %[[CST_2]], %[[CST_3]]) : (tensor<3x2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x2xi32> -// CHECK: %[[VAL_3:.*]] = "tf.Squeeze"(%[[VAL_2]]) {squeeze_dims = [0]} : (tensor<1x2xi32>) -> tensor<2xi32> +// CHECK: %[[VAL_3:.*]] = "tf.Squeeze"(%[[VAL_2]]) <{squeeze_dims = [0]}> : (tensor<1x2xi32>) -> tensor<2xi32> // CHECK: %[[VAL_4:.*]] = "tf.Slice"(%[[ARG_0]], %[[VAL_3]], %[[CST_1]]) : (tensor<3x2944xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x1504xi32> -// CHECK-DAG: %[[CST_4:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CST_5:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_4:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_5:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[VAL_5:.*]] = "tf.Slice"(%[[VAL_1]], %[[CST_4]], %[[CST_5]]) : (tensor<3x2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x2xi32> -// CHECK: %[[VAL_6:.*]] = "tf.Squeeze"(%[[VAL_5]]) {squeeze_dims = [0]} : (tensor<1x2xi32>) -> tensor<2xi32> +// CHECK: %[[VAL_6:.*]] = "tf.Squeeze"(%[[VAL_5]]) <{squeeze_dims = [0]}> : (tensor<1x2xi32>) -> tensor<2xi32> // CHECK: %[[VAL_7:.*]] = "tf.Slice"(%[[ARG_0]], %[[VAL_6]], %[[CST_1]]) : (tensor<3x2944xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x1504xi32> -// CHECK-DAG: %[[CST_6:.*]] = "tf.Const"() {value = dense<[2, 0]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CST_7:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_6:.*]] = "tf.Const"() <{value = dense<[2, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_7:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[VAL_8:.*]] = "tf.Slice"(%[[VAL_1]], %[[CST_6]], %[[CST_7]]) : (tensor<3x2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x2xi32> -// CHECK: %[[VAL_9:.*]] = "tf.Squeeze"(%[[VAL_8]]) {squeeze_dims = [0]} : (tensor<1x2xi32>) -> tensor<2xi32> +// CHECK: %[[VAL_9:.*]] = "tf.Squeeze"(%[[VAL_8]]) <{squeeze_dims = [0]}> : (tensor<1x2xi32>) -> tensor<2xi32> // CHECK: %[[VAL_10:.*]] = "tf.Slice"(%[[ARG_0]], %[[VAL_9]], %[[CST_1]]) : (tensor<3x2944xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x1504xi32> -// CHECK-DAG: %[[CST_8:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %[[CST_8:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[VAL_11:.*]] = "tf.ConcatV2"(%[[VAL_4]], %[[VAL_7]], %[[VAL_10]], %[[CST_8]]) : (tensor<1x1504xi32>, tensor<1x1504xi32>, tensor<1x1504xi32>, tensor) -> tensor<3x1504xi32> // CHECK: return %[[VAL_11]] // CHECK: } @@ -3936,17 +3936,17 @@ func.func 
@convert_gather_to_slice_dynamic_error(%arg0: tensor<3x?xi32>, %arg1: // CHECK-SAME: %[[VAL_0:.*]]: tensor<7x3xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor, // CHECK-SAME: %[[VAL_2:.*]]: tensor) -> tensor<4x2xf32> { -// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: %[[VAL_4:.*]] = "tf.Cast"(%[[VAL_1]]) {Truncate = false} : (tensor) -> tensor -// CHECK: %[[VAL_5:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK: %[[VAL_4:.*]] = "tf.Cast"(%[[VAL_1]]) <{Truncate = false}> : (tensor) -> tensor +// CHECK: %[[VAL_5:.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK: %[[VAL_6:.*]] = "tf.Minimum"(%[[VAL_4]], %[[VAL_5]]) : (tensor, tensor) -> tensor // CHECK: %[[VAL_7:.*]] = "tf.Maximum"(%[[VAL_6]], %[[VAL_3]]) : (tensor, tensor) -> tensor -// CHECK: %[[VAL_8:.*]] = "tf.Cast"(%[[VAL_2]]) {Truncate = false} : (tensor) -> tensor -// CHECK: %[[VAL_9:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[VAL_8:.*]] = "tf.Cast"(%[[VAL_2]]) <{Truncate = false}> : (tensor) -> tensor +// CHECK: %[[VAL_9:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[VAL_10:.*]] = "tf.Minimum"(%[[VAL_8]], %[[VAL_9]]) : (tensor, tensor) -> tensor // CHECK: %[[VAL_11:.*]] = "tf.Maximum"(%[[VAL_10]], %[[VAL_3]]) : (tensor, tensor) -> tensor -// CHECK: %[[VAL_12:.*]] = "tf.Pack"(%[[VAL_7]], %[[VAL_11]]) {axis = 0 : i64} : (tensor, tensor) -> tensor<2xi32> -// CHECK: %[[VAL_13:.*]] = "tf.Const"() {value = dense<[4, 2]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_12:.*]] = "tf.Pack"(%[[VAL_7]], %[[VAL_11]]) <{axis = 0 : i64}> : (tensor, tensor) -> tensor<2xi32> +// CHECK: %[[VAL_13:.*]] = "tf.Const"() <{value = dense<[4, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_14:.*]] = "tf.Slice"(%[[VAL_0]], %[[VAL_12]], %[[VAL_13]]) : (tensor<7x3xf32>, tensor<2xi32>, tensor<2xi64>) -> tensor<4x2xf32> // CHECK: return %[[VAL_14]] : tensor<4x2xf32> // CHECK: } @@ -3959,17 +3959,17 @@ func.func @convert_dynamic_slice(%arg0: tensor<7x3xf32>, %arg1: tensor, %ar // CHECK-SAME: %[[VAL_0:.*]]: tensor<7x3xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor, // CHECK-SAME: %[[VAL_2:.*]]: tensor) -> tensor<4x2xf32> { -// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: %[[VAL_4:.*]] = "tf.Cast"(%[[VAL_1]]) {Truncate = false} : (tensor) -> tensor -// CHECK: %[[VAL_5:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK: %[[VAL_4:.*]] = "tf.Cast"(%[[VAL_1]]) <{Truncate = false}> : (tensor) -> tensor +// CHECK: %[[VAL_5:.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK: %[[VAL_6:.*]] = "tf.Minimum"(%[[VAL_4]], %[[VAL_5]]) : (tensor, tensor) -> tensor // CHECK: %[[VAL_7:.*]] = "tf.Maximum"(%[[VAL_6]], %[[VAL_3]]) : (tensor, tensor) -> tensor -// CHECK: %[[VAL_8:.*]] = "tf.Cast"(%[[VAL_2]]) {Truncate = false} : (tensor) -> tensor -// CHECK: %[[VAL_9:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[VAL_8:.*]] = "tf.Cast"(%[[VAL_2]]) <{Truncate = false}> : (tensor) -> tensor +// CHECK: %[[VAL_9:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[VAL_10:.*]] = "tf.Minimum"(%[[VAL_8]], %[[VAL_9]]) : (tensor, tensor) -> tensor // CHECK: %[[VAL_11:.*]] = "tf.Maximum"(%[[VAL_10]], %[[VAL_3]]) : 
(tensor, tensor) -> tensor -// CHECK: %[[VAL_12:.*]] = "tf.Pack"(%[[VAL_7]], %[[VAL_11]]) {axis = 0 : i64} : (tensor, tensor) -> tensor<2xi32> -// CHECK: %[[VAL_13:.*]] = "tf.Const"() {value = dense<[4, 2]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_12:.*]] = "tf.Pack"(%[[VAL_7]], %[[VAL_11]]) <{axis = 0 : i64}> : (tensor, tensor) -> tensor<2xi32> +// CHECK: %[[VAL_13:.*]] = "tf.Const"() <{value = dense<[4, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[VAL_14:.*]] = "tf.Slice"(%[[VAL_0]], %[[VAL_12]], %[[VAL_13]]) : (tensor<7x3xf32>, tensor<2xi32>, tensor<2xi64>) -> tensor<4x2xf32> // CHECK: return %[[VAL_14]] : tensor<4x2xf32> // CHECK: } @@ -4206,7 +4206,7 @@ func.func @convert_scatter_sub(%arg0: tensor<20x6xf32>, %arg1: tensor<4x1xi32>, // CHECK-LABEL: func @convert_argmax( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, tensor<4x32xi32>) { // CHECK: %[[VAL_9:.*]] = "tf.Const"{{.*}}value = dense<2> : tensor<1xi64> -// CHECK: %[[VAL_10:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_9]]) {keep_dims = false} : {{.*}} -> tensor<4x32xf32> +// CHECK: %[[VAL_10:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_9]]) <{keep_dims = false}> : {{.*}} -> tensor<4x32xf32> // CHECK: %[[VAL_11:.*]] = "tf.ArgMax"(%[[VAL_0]], %[[VAL_9]]) : {{.*}} -> tensor<4x32xi32> // CHECK: return %[[VAL_10]], %[[VAL_11]] // CHECK: } @@ -4233,11 +4233,11 @@ func.func @convert_argmax(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, ten // CHECK-LABEL: func @convert_argmax_constant( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2x2x4xf32>) -> (tensor<2x2xf32>, tensor<2x2xi32>) { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0xFF800000> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<{{\[\[}}[0, 1, 2, 3], [0, 1, 2, 3]], {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x2x4xi32>} : () -> tensor<2x2x4xi32> -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_5:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_4]]) {keep_dims = false} : (tensor<2x2x4xf32>, tensor<1xi64>) -> tensor<2x2xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0xFF800000> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<{{\[\[}}[0, 1, 2, 3], [0, 1, 2, 3]], {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x2x4xi32>}> : () -> tensor<2x2x4xi32> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<2> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_4]]) <{keep_dims = false}> : (tensor<2x2x4xf32>, tensor<1xi64>) -> tensor<2x2xf32> // CHECK: %[[VAL_6:.*]] = "tf.ArgMax"(%[[VAL_0]], %[[VAL_4]]) : (tensor<2x2x4xf32>, tensor<1xi64>) -> tensor<2x2xi32> // CHECK: return %[[VAL_5]], %[[VAL_6]] : tensor<2x2xf32>, tensor<2x2xi32> // CHECK: } @@ -4263,11 +4263,11 @@ func.func @convert_argmax_constant(%arg0: tensor<2x2x4xf32>) -> (tensor<2x2xf32> // CHECK-LABEL: func @convert_argmax_constant_non_z_axis( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x4xf32>) -> (tensor<4xf32>, tensor<4xi32>) { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0xFF800000> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<{{\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 
3]]> : tensor<4x4xi32>} : () -> tensor<4x4xi32> -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_5:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_4]]) {keep_dims = false} : (tensor<4x4xf32>, tensor<1xi64>) -> tensor<4xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0xFF800000> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<{{\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]]> : tensor<4x4xi32>}> : () -> tensor<4x4xi32> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_4]]) <{keep_dims = false}> : (tensor<4x4xf32>, tensor<1xi64>) -> tensor<4xf32> // CHECK: %[[VAL_6:.*]] = "tf.ArgMax"(%[[VAL_0]], %[[VAL_4]]) : (tensor<4x4xf32>, tensor<1xi64>) -> tensor<4xi32> // CHECK: return %[[VAL_5]], %[[VAL_6]] : tensor<4xf32>, tensor<4xi32> // CHECK: } @@ -4293,14 +4293,14 @@ func.func @convert_argmax_constant_non_z_axis(%arg0: tensor<4x4xf32>) -> (tensor // CHECK-LABEL: func.func @convert_argmax_bool( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>) -> tensor { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<2> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[VAL_4:.*]] = "tf.Range"(%[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) : (tensor, tensor, tensor) -> tensor<2xi32> -// CHECK-DAG: %[[VAL_5:.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_6:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_7:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_8:.*]] = "tf.Any"(%[[VAL_0]], %[[VAL_7]]) {keep_dims = false} : (tensor<2xi1>, tensor<1xi64>) -> tensor +// CHECK-DAG: %[[VAL_5:.*]] = "tf.Const"() <{value = dense : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_6:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_7:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_8:.*]] = "tf.Any"(%[[VAL_0]], %[[VAL_7]]) <{keep_dims = false}> : (tensor<2xi1>, tensor<1xi64>) -> tensor // CHECK: %[[VAL_9:.*]] = "tf.ArgMax"(%[[VAL_0]], %[[VAL_7]]) : (tensor<2xi1>, tensor<1xi64>) -> tensor // CHECK: return %[[VAL_9]] : tensor // CHECK: } @@ -4326,7 +4326,7 @@ func.func @convert_argmax_bool(%arg0: tensor<2xi1>) -> tensor { // CHECK-LABEL: func @convert_argmin( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, tensor<4x32xi32>) { // CHECK: %[[VAL_9:.*]] = "tf.Const"{{.*}}value = dense<2> : tensor<1xi64> -// CHECK: %[[VAL_10:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_9]]) {keep_dims = false} : {{.*}} -> tensor<4x32xf32> +// CHECK: %[[VAL_10:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_9]]) <{keep_dims = false}> : {{.*}} -> tensor<4x32xf32> // CHECK: %[[VAL_11:.*]] = "tf.ArgMin"(%[[VAL_0]], %[[VAL_9]]) : {{.*}} -> tensor<4x32xi32> // CHECK: return %[[VAL_10]], %[[VAL_11]] // CHECK: } @@ -4354,7 +4354,7 
@@ func.func @convert_argmin(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, ten // CHECK-LABEL: func @convert_argmin_i16( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi16>) -> (tensor, tensor) { // CHECK: %[[VAL_9:.*]] = "tf.Const"{{.*}}value = dense<0> : tensor<1xi64> -// CHECK: %[[VAL_10:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_9]]) {keep_dims = false} : {{.*}} -> tensor +// CHECK: %[[VAL_10:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_9]]) <{keep_dims = false}> : {{.*}} -> tensor // CHECK: %[[VAL_11:.*]] = "tf.ArgMin"(%[[VAL_0]], %[[VAL_9]]) : {{.*}} -> tensor // CHECK: return %[[VAL_10]], %[[VAL_11]] // CHECK: } @@ -4381,11 +4381,11 @@ func.func @convert_argmin_i16(%arg0: tensor<2xi16>) -> (tensor, tensor // CHECK-LABEL: func @convert_argmin_constant( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2x2x4xf32>) -> (tensor<2x2xf32>, tensor<2x2xi32>) { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0x7F800000> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<{{\[\[}}[0, 1, 2, 3], [0, 1, 2, 3]], {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x2x4xi32>} : () -> tensor<2x2x4xi32> -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_5:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_4]]) {keep_dims = false} : (tensor<2x2x4xf32>, tensor<1xi64>) -> tensor<2x2xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0x7F800000> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<{{\[\[}}[0, 1, 2, 3], [0, 1, 2, 3]], {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x2x4xi32>}> : () -> tensor<2x2x4xi32> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<2> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_4]]) <{keep_dims = false}> : (tensor<2x2x4xf32>, tensor<1xi64>) -> tensor<2x2xf32> // CHECK: %[[VAL_6:.*]] = "tf.ArgMin"(%[[VAL_0]], %[[VAL_4]]) : (tensor<2x2x4xf32>, tensor<1xi64>) -> tensor<2x2xi32> // CHECK: return %[[VAL_5]], %[[VAL_6]] : tensor<2x2xf32>, tensor<2x2xi32> // CHECK: } @@ -4411,14 +4411,14 @@ func.func @convert_argmin_constant(%arg0: tensor<2x2x4xf32>) -> (tensor<2x2xf32> // CHECK-LABEL: func.func @convert_argmin_bool( // CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>) -> tensor { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<2> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[VAL_4:.*]] = "tf.Range"(%[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) : (tensor, tensor, tensor) -> tensor<2xi32> -// CHECK-DAG: %[[VAL_5:.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_6:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_7:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_8:.*]] = "tf.All"(%[[VAL_0]], %[[VAL_7]]) {keep_dims = false} : (tensor<2xi1>, tensor<1xi64>) -> tensor +// CHECK-DAG: %[[VAL_5:.*]] = "tf.Const"() <{value = 
dense : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_6:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_7:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_8:.*]] = "tf.All"(%[[VAL_0]], %[[VAL_7]]) <{keep_dims = false}> : (tensor<2xi1>, tensor<1xi64>) -> tensor // CHECK: %[[VAL_9:.*]] = "tf.ArgMin"(%[[VAL_0]], %[[VAL_7]]) : (tensor<2xi1>, tensor<1xi64>) -> tensor // CHECK: return %[[VAL_9]] : tensor // CHECK: } @@ -4442,16 +4442,16 @@ func.func @convert_argmin_bool(%arg0: tensor<2xi1>) -> tensor { // CHECK-LABEL: func @convert_argmax_with_reshaped_iota( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x32x1xf32>) -> (tensor<1x1xf32>, tensor<1x1xi32>) { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0xFF800000> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<32> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_5:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0xFF800000> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_3:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() <{value = dense<32> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_5:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[VAL_6:.*]] = "tf.Range"(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) : (tensor, tensor, tensor) -> tensor<32xi32> // CHECK-DAG: %[[VAL_7:.*]] = arith.constant dense<[1, 32, 1]> : tensor<3xi64> // CHECK: %[[VAL_8:.*]] = "tf.Reshape"(%[[VAL_6]], %[[VAL_7]]) : (tensor<32xi32>, tensor<3xi64>) -> tensor<1x32x1xi32> -// CHECK-DAG: %[[VAL_9:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[VAL_10:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_9]]) {keep_dims = false} : (tensor<1x32x1xf32>, tensor<1xi64>) -> tensor<1x1xf32> +// CHECK-DAG: %[[VAL_9:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[VAL_10:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_9]]) <{keep_dims = false}> : (tensor<1x32x1xf32>, tensor<1xi64>) -> tensor<1x1xf32> // CHECK: %[[VAL_11:.*]] = "tf.ArgMax"(%[[VAL_0]], %[[VAL_9]]) : (tensor<1x32x1xf32>, tensor<1xi64>) -> tensor<1x1xi32> // CHECK: return %[[VAL_10]], %[[VAL_11]] : tensor<1x1xf32>, tensor<1x1xi32> // CHECK: } @@ -4488,7 +4488,7 @@ func.func @convert_not(%arg0: tensor<5x3x1xi1>) -> tensor<5x3x1xi1> { // CHECK-LABEL: func @convert_not_i8( // CHECK-SAME: %[[ARG:.*]]: tensor<7x9x11xi8>) -> tensor<7x9x11xi8> { -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor // CHECK: %[[RES:.*]] = "tf.BitwiseXor"(%[[ARG]], %[[CST]]) : (tensor<7x9x11xi8>, tensor) -> tensor<7x9x11xi8> // CHECK: return %[[RES]] : tensor<7x9x11xi8> // CHECK: } @@ -4499,7 +4499,7 @@ func.func @convert_not_i8(%arg0: tensor<7x9x11xi8>) -> tensor<7x9x11xi8> { // CHECK-LABEL: func @convert_not_i16( // CHECK-SAME: %[[ARG:.*]]: tensor<7x9x11xi16>) -> tensor<7x9x11xi16> { -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor // 
CHECK: %[[RES:.*]] = "tf.BitwiseXor"(%[[ARG]], %[[CST]]) : (tensor<7x9x11xi16>, tensor) -> tensor<7x9x11xi16> // CHECK: return %[[RES]] : tensor<7x9x11xi16> // CHECK: } @@ -4510,7 +4510,7 @@ func.func @convert_not_i16(%arg0: tensor<7x9x11xi16>) -> tensor<7x9x11xi16> { // CHECK-LABEL: func @convert_not_i32( // CHECK-SAME: %[[ARG:.*]]: tensor<7x9x11xi32>) -> tensor<7x9x11xi32> { -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor // CHECK: %[[RES:.*]] = "tf.BitwiseXor"(%[[ARG]], %[[CST]]) : (tensor<7x9x11xi32>, tensor) -> tensor<7x9x11xi32> // CHECK: return %[[RES]] : tensor<7x9x11xi32> // CHECK: } @@ -4521,7 +4521,7 @@ func.func @convert_not_i32(%arg0: tensor<7x9x11xi32>) -> tensor<7x9x11xi32> { // CHECK-LABEL: func @convert_not_i64( // CHECK-SAME: %[[ARG:.*]]: tensor<7x9x11xi64>) -> tensor<7x9x11xi64> { -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor // CHECK: %[[RES:.*]] = "tf.BitwiseXor"(%[[ARG]], %[[CST]]) : (tensor<7x9x11xi64>, tensor) -> tensor<7x9x11xi64> // CHECK: return %[[RES]] : tensor<7x9x11xi64> // CHECK: } @@ -4532,7 +4532,7 @@ func.func @convert_not_i64(%arg0: tensor<7x9x11xi64>) -> tensor<7x9x11xi64> { // CHECK-LABEL: func @convert_not_ui8( // CHECK-SAME: %[[ARG:.*]]: tensor<7x9x11xui8>) -> tensor<7x9x11xui8> { -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<255> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<255> : tensor}> : () -> tensor // CHECK: %[[RES:.*]] = "tf.BitwiseXor"(%[[ARG]], %[[CST]]) : (tensor<7x9x11xui8>, tensor) -> tensor<7x9x11xui8> // CHECK: return %[[RES]] : tensor<7x9x11xui8> // CHECK: } @@ -4543,7 +4543,7 @@ func.func @convert_not_ui8(%arg0: tensor<7x9x11xui8>) -> tensor<7x9x11xui8> { // CHECK-LABEL: func @convert_not_ui16( // CHECK-SAME: %[[ARG:.*]]: tensor<7x9x11xui16>) -> tensor<7x9x11xui16> { -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<65535> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<65535> : tensor}> : () -> tensor // CHECK: %[[RES:.*]] = "tf.BitwiseXor"(%[[ARG]], %[[CST]]) : (tensor<7x9x11xui16>, tensor) -> tensor<7x9x11xui16> // CHECK: return %[[RES]] : tensor<7x9x11xui16> // CHECK: } @@ -4554,7 +4554,7 @@ func.func @convert_not_ui16(%arg0: tensor<7x9x11xui16>) -> tensor<7x9x11xui16> { // CHECK-LABEL: func @convert_not_ui32( // CHECK-SAME: %[[ARG:.*]]: tensor<7x9x11xui32>) -> tensor<7x9x11xui32> { -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<4294967295> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<4294967295> : tensor}> : () -> tensor // CHECK: %[[RES:.*]] = "tf.BitwiseXor"(%[[ARG]], %[[CST]]) : (tensor<7x9x11xui32>, tensor) -> tensor<7x9x11xui32> // CHECK: return %[[RES]] : tensor<7x9x11xui32> // CHECK: } @@ -4565,7 +4565,7 @@ func.func @convert_not_ui32(%arg0: tensor<7x9x11xui32>) -> tensor<7x9x11xui32> { // CHECK-LABEL: func @convert_not_ui64( // CHECK-SAME: %[[ARG:.*]]: tensor<7x9x11xui64>) -> tensor<7x9x11xui64> { -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<18446744073709551615> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<18446744073709551615> : tensor}> : () -> tensor // CHECK: %[[RES:.*]] = "tf.BitwiseXor"(%[[ARG]], %[[CST]]) : (tensor<7x9x11xui64>, tensor) -> tensor<7x9x11xui64> // CHECK: return %[[RES]] : tensor<7x9x11xui64> // CHECK: } @@ 
-4580,7 +4580,7 @@ func.func @convert_not_ui64(%arg0: tensor<7x9x11xui64>) -> tensor<7x9x11xui64> { // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<1> : tensor // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<0> : tensor // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<1000> : tensor -// CHECK: %[[WHILEREGION_0:.*]]:3 = "tf.WhileRegion"(%[[CST_1]], %[[CST_0]], %[[CST_2]]) ({ +// CHECK: %[[WHILEREGION_0:.*]]:3 = "tf.WhileRegion"(%[[CST_1]], %[[CST_0]], %[[CST_2]]) <{is_stateless = false, parallel_iterations = 10 : i64}> ({ // CHECK: ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor): // CHECK: %[[LESS_0:.*]] = "tf.Less"(%arg0, %arg2) : (tensor, tensor) -> tensor // CHECK: "tf.Yield"(%[[LESS_0]]) : (tensor) -> () @@ -4588,7 +4588,7 @@ func.func @convert_not_ui64(%arg0: tensor<7x9x11xui64>) -> tensor<7x9x11xui64> { // CHECK: ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor): // CHECK: %[[ADDV2_0:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor) -> tensor // CHECK: "tf.Yield"(%[[ADDV2_0]], %arg1, %arg2) : (tensor, tensor, tensor) -> () -// CHECK: }) {is_stateless = false, parallel_iterations = 10 : i64} : (tensor, tensor, tensor) -> (tensor, tensor, tensor) +// CHECK: }) : (tensor, tensor, tensor) -> (tensor, tensor, tensor) // CHECK: return %[[WHILEREGION_0]]#0, %[[WHILEREGION_0]]#1, %[[WHILEREGION_0]]#2 : tensor, tensor, tensor // CHECK: } func.func @while_with_variadic() -> (tensor, tensor, tensor) { @@ -4613,19 +4613,19 @@ func.func @while_with_variadic() -> (tensor, tensor, tensor) { // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<1> : tensor // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<0> : tensor // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<1000> : tensor -// CHECK: %[[WHILEREGION_0:.*]]:5 = "tf.WhileRegion"(%[[CST_1]], %[[CST_0]], %[[CST_2]], %arg0, %arg1) ({ +// CHECK: %[[WHILEREGION_0:.*]]:5 = "tf.WhileRegion"(%[[CST_1]], %[[CST_0]], %[[CST_2]], %arg0, %arg1) <{is_stateless = false, parallel_iterations = 10 : i64}> ({ // CHECK: ^bb0(%arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor<1x256xf32>, %arg6: tensor<1xf32>): // CHECK: %[[LESS_0:.*]] = "tf.Less"(%arg2, %arg4) : (tensor, tensor) -> tensor // CHECK: "tf.Yield"(%[[LESS_0]]) : (tensor) -> () // CHECK: }, { // CHECK: ^bb0(%arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor<1x256xf32>, %arg6: tensor<1xf32>): // CHECK: %[[ADDV2_0:.*]] = "tf.AddV2"(%arg2, %arg3) : (tensor, tensor) -> tensor -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[SUM_0:.*]] = "tf.Sum"(%arg5, %[[CONST_1]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[SUM_0:.*]] = "tf.Sum"(%arg5, %[[CONST_1]]) <{keep_dims = false}> : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: %[[ADDV2_1:.*]] = "tf.AddV2"(%[[SUM_0]], %arg6) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> // CHECK: "tf.Yield"(%[[ADDV2_0]], %arg3, %arg4, %arg5, %[[ADDV2_1]]) : (tensor, tensor, tensor, tensor<1x256xf32>, tensor<1xf32>) -> () -// CHECK: }) {is_stateless = false, parallel_iterations = 10 : i64} : (tensor, tensor, tensor, tensor<1x256xf32>, tensor<1xf32>) -> (tensor, tensor, tensor, tensor<1x256xf32>, tensor<1xf32>) +// CHECK: }) : 
(tensor, tensor, tensor, tensor<1x256xf32>, tensor<1xf32>) -> (tensor, tensor, tensor, tensor<1x256xf32>, tensor<1xf32>) // CHECK: return %[[WHILEREGION_0]]#0, %[[WHILEREGION_0]]#1, %[[WHILEREGION_0]]#2, %[[WHILEREGION_0]]#4 : tensor, tensor, tensor, tensor<1xf32> // CHECK: } func.func @while_with_reduce(%arg0: tensor<1x256xf32>, %arg1: tensor<1xf32>) -> (tensor, tensor, tensor, tensor<1xf32>) { @@ -4656,11 +4656,11 @@ func.func @while_with_reduce(%arg0: tensor<1x256xf32>, %arg1: tensor<1xf32>) -> // CHECK-LABEL: func @if // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : tensor // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<1000> : tensor -// CHECK: %[[RES:.*]] = "tf.IfRegion"(%arg0) ({ +// CHECK: %[[RES:.*]] = "tf.IfRegion"(%arg0) <{is_stateless = false}> ({ // CHECK: "tf.Yield"(%[[CST_0]]) : (tensor) -> () // CHECK: }, { // CHECK: "tf.Yield"(%[[CST_1]]) : (tensor) -> () -// CHECK: }) {is_stateless = false} : (tensor) -> tensor +// CHECK: }) : (tensor) -> tensor // CHECK: return %[[RES]] func.func @if(%arg0: tensor) -> (tensor) { %cst_0 = arith.constant dense<0> : tensor @@ -4679,7 +4679,7 @@ func.func @if(%arg0: tensor) -> (tensor) { // CHECK-SAME: %[[VAL_2:[a-z0-9]*]]: tensor, // CHECK-SAME: %[[VAL_3:[a-z0-9]*]]: tensor, // CHECK-SAME: %[[VAL_4:[a-z0-9]*]]: tensor) -> tensor<28x1x100xf32> { -// CHECK: %0 = "tf.Pack"(%arg2, %arg3, %arg4) {axis = 0 : i64} : (tensor, tensor, tensor) -> tensor<3xi32> +// CHECK: %0 = "tf.Pack"(%arg2, %arg3, %arg4) <{axis = 0 : i64}> : (tensor, tensor, tensor) -> tensor<3xi32> // CHECK: %1 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %0) : (tensor<28x1x100xf32>, tensor<1x1x100xf32>, tensor<3xi32>) -> tensor<28x1x100xf32> // CHECK: return %1 : tensor<28x1x100xf32> func.func @convert_dynamic_update_slice(%arg0: tensor<28x1x100xf32>, %arg1: tensor<1x1x100xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor<28x1x100xf32> { @@ -4692,7 +4692,7 @@ func.func @convert_dynamic_update_slice(%arg0: tensor<28x1x100xf32>, %arg1: tens // CHECK-SAME: %arg1: tensor, // CHECK-SAME: %arg2: tensor, // CHECK-SAME: %arg3: tensor) -> tensor { -// CHECK: %0 = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64} : (tensor, tensor) -> tensor<2xi32> +// CHECK: %0 = "tf.Pack"(%arg2, %arg3) <{axis = 0 : i64}> : (tensor, tensor) -> tensor<2xi32> // CHECK: %1 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %0) : (tensor, tensor, tensor<2xi32>) -> tensor // CHECK: return %1 : tensor // CHECK: } @@ -4707,7 +4707,7 @@ func.func @dynamic_update_slice_inputs_have_dynamic_dim(%arg0: tensor, // CHECK-SAME: %arg2: tensor, // CHECK-SAME: %arg3: tensor, // CHECK-SAME: %arg4: tensor) -> tensor<1x?x256xf32> { -// CHECK: %0 = "tf.Pack"(%arg2, %arg3, %arg4) {axis = 0 : i64} : (tensor, tensor, tensor) -> tensor<3xi32> +// CHECK: %0 = "tf.Pack"(%arg2, %arg3, %arg4) <{axis = 0 : i64}> : (tensor, tensor, tensor) -> tensor<3xi32> // CHECK: %1 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %0) : (tensor<1x?x256xf32>, tensor<1x1x256xf32>, tensor<3xi32>) -> tensor<1x?x256xf32> // CHECK: return %1 : tensor<1x?x256xf32> // CHECK: } @@ -4719,9 +4719,9 @@ func.func @dynamic_update_slice_operand_has_dynamic_dim(%arg0: tensor<1x?x256xf3 // CHECK-LABEL: func @convert_reduce_to_all( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x2x3x4x5xi1>, // CHECK-SAME: %[[ARG_1:.*]]: tensor<2xi64>) -> tensor<2x4x5xi1> { -// CHECK-DAG: %[[TRUE_CST:.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor -// CHECK-DAG: %[[DIMENSIONS:.*]] = "tf.Const"() {value = dense<[0, 2]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: %[[VAL_0:.*]] = 
"tf.All"(%[[ARG_0]], %[[DIMENSIONS]]) {keep_dims = false} : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1> +// CHECK-DAG: %[[TRUE_CST:.*]] = "tf.Const"() <{value = dense : tensor}> : () -> tensor +// CHECK-DAG: %[[DIMENSIONS:.*]] = "tf.Const"() <{value = dense<[0, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK: %[[VAL_0:.*]] = "tf.All"(%[[ARG_0]], %[[DIMENSIONS]]) <{keep_dims = false}> : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1> // CHECK: return %[[VAL_0:.*]] : tensor<2x4x5xi1> // CHECK: } func.func @convert_reduce_to_all(%arg0: tensor<1x2x3x4x5xi1>, %arg1: tensor<2xi64>) -> tensor<2x4x5xi1> { @@ -4738,8 +4738,8 @@ func.func @convert_reduce_to_all(%arg0: tensor<1x2x3x4x5xi1>, %arg1: tensor<2xi6 // CHECK-SAME: %[[ARG_0:.*]]: tensor, // CHECK-SAME: %[[ARG_1:.*]]: tensor<1x2x3x4x5xi1>, // CHECK-SAME: %[[ARG_2:.*]]: tensor<2xi64>) -> tensor<2x4x5xi1> { -// CHECK-DAG: %[[DIMENSIONS:.*]] = "tf.Const"() {value = dense<[0, 2]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: %[[VAL_0:.*]] = "tf.All"(%[[ARG_1]], %[[DIMENSIONS]]) {keep_dims = false} : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1> +// CHECK-DAG: %[[DIMENSIONS:.*]] = "tf.Const"() <{value = dense<[0, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK: %[[VAL_0:.*]] = "tf.All"(%[[ARG_1]], %[[DIMENSIONS]]) <{keep_dims = false}> : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1> // CHECK: %[[VAL_1:.*]] = "tf.LogicalAnd"(%[[VAL_0]], %[[ARG_0]]) : (tensor<2x4x5xi1>, tensor) -> tensor<2x4x5xi1> // CHECK: return %[[VAL_1:.*]] : tensor<2x4x5xi1> // CHECK: } @@ -4755,9 +4755,9 @@ func.func @convert_reduce_to_all_non_constant_init(%arg0: tensor, %arg1: ten // CHECK-LABEL: func @convert_reduce_to_any( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x2x3x4x5xi1>, // CHECK-SAME: %[[ARG_1:.*]]: tensor<2xi64>) -> tensor<2x4x5xi1> { -// CHECK-DAG: %[[FALSE_CST:.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor -// CHECK-DAG: %[[DIMENSIONS:.*]] = "tf.Const"() {value = dense<[0, 2]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: %[[VAL_0:.*]] = "tf.Any"(%[[ARG_0]], %[[DIMENSIONS]]) {keep_dims = false} : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1> +// CHECK-DAG: %[[FALSE_CST:.*]] = "tf.Const"() <{value = dense : tensor}> : () -> tensor +// CHECK-DAG: %[[DIMENSIONS:.*]] = "tf.Const"() <{value = dense<[0, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK: %[[VAL_0:.*]] = "tf.Any"(%[[ARG_0]], %[[DIMENSIONS]]) <{keep_dims = false}> : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1> // CHECK: return %[[VAL_0:.*]] : tensor<2x4x5xi1> // CHECK: } func.func @convert_reduce_to_any(%arg0: tensor<1x2x3x4x5xi1>, %arg1: tensor<2xi64>) -> tensor<2x4x5xi1> { @@ -4774,8 +4774,8 @@ func.func @convert_reduce_to_any(%arg0: tensor<1x2x3x4x5xi1>, %arg1: tensor<2xi6 // CHECK-SAME: %[[ARG_0:.*]]: tensor, // CHECK-SAME: %[[ARG_1:.*]]: tensor<1x2x3x4x5xi1>, // CHECK-SAME: %[[ARG_2:.*]]: tensor<2xi64>) -> tensor<2x4x5xi1> { -// CHECK-DAG: %[[DIMENSIONS:.*]] = "tf.Const"() {value = dense<[0, 2]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: %[[VAL_0:.*]] = "tf.Any"(%[[ARG_1]], %[[DIMENSIONS]]) {keep_dims = false} : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1> +// CHECK-DAG: %[[DIMENSIONS:.*]] = "tf.Const"() <{value = dense<[0, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK: %[[VAL_0:.*]] = "tf.Any"(%[[ARG_1]], %[[DIMENSIONS]]) <{keep_dims = false}> : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1> // CHECK: %[[VAL_1:.*]] = "tf.LogicalOr"(%[[VAL_0]], %[[ARG_0]]) 
: (tensor<2x4x5xi1>, tensor) -> tensor<2x4x5xi1> // CHECK: return %[[VAL_1:.*]] : tensor<2x4x5xi1> // CHECK: } @@ -4790,14 +4790,14 @@ func.func @convert_reduce_to_any_non_constant_init(%arg0: tensor, %arg1: ten // CHECK-LABEL: func @convert_sort_to_topk_iota_broadcast( // CHECK-SAME: %[[ARG_0:.*]]: tensor<3x6xf32>) -> (tensor<3x6xf32>, tensor<3x6xi32>) { -// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[VAL_3:.*]] = "tf.Range"(%cst, %cst_0, %cst_1) : (tensor, tensor, tensor) -> tensor<6xi32> // CHECK: %[[VAL_4:.*]] = arith.constant dense<[3, 6]> : tensor<2xi64> // CHECK: %[[VAL_5:.*]] = "tf.BroadcastTo"(%0, %cst_2) : (tensor<6xi32>, tensor<2xi64>) -> tensor<3x6xi32> -// CHECK: %[[K:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor -// CHECK: %[[VALUES:.*]], %[[INDICES:.*]] = "tf.TopKV2"(%[[ARG_0]], %[[K]]) {sorted = true} : (tensor<3x6xf32>, tensor) -> (tensor<3x6xf32>, tensor<3x6xi32>) +// CHECK: %[[K:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor +// CHECK: %[[VALUES:.*]], %[[INDICES:.*]] = "tf.TopKV2"(%[[ARG_0]], %[[K]]) <{sorted = true}> : (tensor<3x6xf32>, tensor) -> (tensor<3x6xf32>, tensor<3x6xi32>) // CHECK: return %[[VALUES]], %[[INDICES]] : tensor<3x6xf32>, tensor<3x6xi32> // CHECK: } func.func @convert_sort_to_topk_iota_broadcast(%arg0: tensor<3x6xf32>) -> (tensor<3x6xf32>, tensor<3x6xi32>) { @@ -4813,11 +4813,11 @@ func.func @convert_sort_to_topk_iota_broadcast(%arg0: tensor<3x6xf32>) -> (tenso // CHECK-LABEL: func @convert_sort_to_topk_iotacst_broadcast( // CHECK-SAME: %[[ARG_0:.*]]: tensor<3x6xf32>) -> (tensor<3x6xf32>, tensor<3x6xi32>) { -// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() {value = dense<[0, 1, 2, 3, 4, 5]> : tensor<6xi32>} : () -> tensor<6xi32> +// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() <{value = dense<[0, 1, 2, 3, 4, 5]> : tensor<6xi32>}> : () -> tensor<6xi32> // CHECK-DAG: %[[VAL_1:.*]] = arith.constant dense<[3, 6]> : tensor<2xi64> // CHECK: %[[VAL_2:.*]] = "tf.BroadcastTo"(%cst, %cst_0) : (tensor<6xi32>, tensor<2xi64>) -> tensor<3x6xi32> -// CHECK: %[[K:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor -// CHECK: %[[VALUES:.*]], %[[INDICES:.*]] = "tf.TopKV2"(%[[ARG_0]], %[[K]]) {sorted = true} : (tensor<3x6xf32>, tensor) -> (tensor<3x6xf32>, tensor<3x6xi32>) +// CHECK: %[[K:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor +// CHECK: %[[VALUES:.*]], %[[INDICES:.*]] = "tf.TopKV2"(%[[ARG_0]], %[[K]]) <{sorted = true}> : (tensor<3x6xf32>, tensor) -> (tensor<3x6xf32>, tensor<3x6xi32>) // CHECK: return %[[VALUES]], %[[INDICES]] : tensor<3x6xf32>, tensor<3x6xi32> // CHECK: } func.func @convert_sort_to_topk_iotacst_broadcast(%arg0: tensor<3x6xf32>) -> (tensor<3x6xf32>, tensor<3x6xi32>) { @@ -4833,9 +4833,9 @@ func.func @convert_sort_to_topk_iotacst_broadcast(%arg0: tensor<3x6xf32>) -> (te // CHECK-LABEL: func @convert_sort_to_topk_const( // CHECK-SAME: %[[ARG_0:.*]]: tensor<3x6xf32>) -> (tensor<3x6xf32>, tensor<3x6xi32>) { -// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<3x6xi32>} : () -> 
tensor<3x6xi32> -// CHECK-DAG: %[[K:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor -// CHECK: %[[VALUES:.*]], %[[INDICES:.*]] = "tf.TopKV2"(%[[ARG_0]], %[[K]]) {sorted = true} : (tensor<3x6xf32>, tensor) -> (tensor<3x6xf32>, tensor<3x6xi32>) +// CHECK-DAG: %[[VAL_0:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<3x6xi32>}> : () -> tensor<3x6xi32> +// CHECK-DAG: %[[K:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor +// CHECK: %[[VALUES:.*]], %[[INDICES:.*]] = "tf.TopKV2"(%[[ARG_0]], %[[K]]) <{sorted = true}> : (tensor<3x6xf32>, tensor) -> (tensor<3x6xf32>, tensor<3x6xi32>) // CHECK: return %[[VALUES]], %[[INDICES]] : tensor<3x6xf32>, tensor<3x6xi32> // CHECK: } func.func @convert_sort_to_topk_const(%arg0: tensor<3x6xf32>) -> (tensor<3x6xf32>, tensor<3x6xi32>) { @@ -4913,7 +4913,7 @@ func.func @not_convert_remainder_for_uint64(%arg0: tensor<10x8xui64>, %arg1: ten // CHECK-LABEL: func @convert_population_count_i32( // CHECK-SAME: %[[ARG_0:.*]]: tensor<8xi32> // CHECK: %[[POP_CNT:.*]] = "tf.PopulationCount"(%[[ARG_0]]) : (tensor<8xi32>) -> tensor<8xui8> -// CHECK: %[[RES:.*]] = "tf.Cast"(%[[POP_CNT]]) {Truncate = false} : (tensor<8xui8>) -> tensor<8xi32> +// CHECK: %[[RES:.*]] = "tf.Cast"(%[[POP_CNT]]) <{Truncate = false}> : (tensor<8xui8>) -> tensor<8xi32> // CHECK: return %[[RES]] // CHECK: } func.func @convert_population_count_i32(%arg0: tensor<8xi32>) -> tensor<8xi32> { @@ -4932,8 +4932,8 @@ func.func @convert_population_count_ui8(%arg0: tensor<8xui8>) -> tensor<8xui8> { } // CHECK-LABEL: func @torch_index_select( -// CHECK: %[[AXIS:.+]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: %[[RES:.+]] = "tf.GatherV2"(%arg0, %arg1, %[[AXIS]]) {batch_dims = 0 : i64} +// CHECK: %[[AXIS:.+]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK: %[[RES:.+]] = "tf.GatherV2"(%arg0, %arg1, %[[AXIS]]) <{batch_dims = 0 : i64}> // CHECK: return %[[RES]] func.func @torch_index_select(%arg0: tensor<2x1xf32>, %arg1: tensor<2xi32>) -> tensor<2x1xf32> { @@ -4945,9 +4945,9 @@ func.func @torch_index_select(%arg0: tensor<2x1xf32>, %arg1: tensor<2xi32>) -> t // CHECK-LABEL: func @lowered_cumsum( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x12xf32>) -> tensor<4x12xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: %[[VAL_3:.*]] = "tf.Cumsum"(%[[VAL_0]], %[[VAL_2]]) {exclusive = false, reverse = false} : (tensor<4x12xf32>, tensor) -> tensor<4x12xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Cumsum"(%[[VAL_0]], %[[VAL_2]]) <{exclusive = false, reverse = false}> : (tensor<4x12xf32>, tensor) -> tensor<4x12xf32> // CHECK: return %[[VAL_3]] : tensor<4x12xf32> // CHECK: } func.func @lowered_cumsum(%arg0: tensor<4x12xf32>) -> tensor<4x12xf32> { @@ -4962,9 +4962,9 @@ func.func @lowered_cumsum(%arg0: tensor<4x12xf32>) -> tensor<4x12xf32> { // CHECK-LABEL: func @lowered_cumsum_trivial_attrs( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x12xf32>) -> tensor<4x12xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: %[[VAL_3:.*]] = "tf.Cumsum"(%[[VAL_0]], %[[VAL_2]]) {exclusive = false, reverse = 
false} : (tensor<4x12xf32>, tensor) -> tensor<4x12xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Cumsum"(%[[VAL_0]], %[[VAL_2]]) <{exclusive = false, reverse = false}> : (tensor<4x12xf32>, tensor) -> tensor<4x12xf32> // CHECK: return %[[VAL_3]] : tensor<4x12xf32> // CHECK: } func.func @lowered_cumsum_trivial_attrs(%arg0: tensor<4x12xf32>) -> tensor<4x12xf32> { @@ -4979,9 +4979,9 @@ func.func @lowered_cumsum_trivial_attrs(%arg0: tensor<4x12xf32>) -> tensor<4x12x // CHECK-LABEL: func @lowered_cumprod( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x12xf32>) -> tensor<4x12xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK: %[[VAL_3:.*]] = "tf.Cumprod"(%[[VAL_0]], %[[VAL_2]]) {exclusive = false, reverse = false} : (tensor<4x12xf32>, tensor) -> tensor<4x12xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Cumprod"(%[[VAL_0]], %[[VAL_2]]) <{exclusive = false, reverse = false}> : (tensor<4x12xf32>, tensor) -> tensor<4x12xf32> // CHECK: return %[[VAL_3]] : tensor<4x12xf32> // CHECK: } func.func @lowered_cumprod(%arg0: tensor<4x12xf32>) -> tensor<4x12xf32> { @@ -5022,7 +5022,7 @@ func.func @convert_dot_quant_type(%arg0: tensor<1x256xf32>, %arg1: tensor<256x!q // CHECK-LABEL: func @get_dimension_size( // CHECK-SAME: %[[ARG_0:.*]]: tensor<4x256x?xf32>) -> tensor { -// CHECK %[[CST_0:.*]] = "tf.Const"() {value = dense<256> : tensor} : () -> tensor +// CHECK %[[CST_0:.*]] = "tf.Const"() <{value = dense<256> : tensor}> : () -> tensor // CHECK return %[[CST_0]] : tensor func.func @get_dimension_size(%arg0: tensor<4x256x?xf32>) -> tensor { %0 = "mhlo.get_dimension_size"(%arg0) {dimension = 1 : i64} : (tensor<4x256x?xf32>) -> tensor @@ -5032,10 +5032,10 @@ func.func @get_dimension_size(%arg0: tensor<4x256x?xf32>) -> tensor { // CHECK-LABEL: func @get_dimension_size_dynamic( // CHECK-SAME: %[[ARG_0:.*]]: tensor<4x256x?xf32>) -> tensor { // CHECK %[[VAL_0:.*]] = "tf.Shape"(%[[ARG_0]]) : (tensor<4x256x?xf32>) -> tensor<3xi32> -// CHECK %[[CST_0:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK %[[CST_1:.*]] = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK %[[CST_0:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK %[[CST_1:.*]] = "tf.Const"() <{value = dense<2> : tensor<1xi64>}> : () -> tensor<1xi64> // CHECK %[[VAL_1:.*]] = "tf.Slice"(%[[VAL_0]], %[[CST_1]], %[[CST_0]]) : (tensor<3xi32>, tensor<1xi64>, tensor<1xi32>) -> tensor<1xi32> -// CHECK %[[VAL_2:.*]] = "tf.Squeeze"(%[[VAL_1]]) {squeeze_dims = [0]} : (tensor<1xi32>) -> tensor +// CHECK %[[VAL_2:.*]] = "tf.Squeeze"(%[[VAL_1]]) <{squeeze_dims = [0]}> : (tensor<1xi32>) -> tensor // CHECK return %[[VAL_2]] : tensor func.func @get_dimension_size_dynamic(%arg0: tensor<4x256x?xf32>) -> tensor { %0 = "mhlo.get_dimension_size"(%arg0) {dimension = 2 : i64} : (tensor<4x256x?xf32>) -> tensor @@ -5044,10 +5044,10 @@ func.func @get_dimension_size_dynamic(%arg0: tensor<4x256x?xf32>) -> tensor // CHECK-LABEL: func @dynamic_iota_i32_1d( // CHECK-SAME: %[[ARG_0:.*]]: 
tensor<1xi32>) -> tensor { -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK: %[[VAL_0:.*]] = "tf.Reshape"(%arg0, %[[CST_0]]) : (tensor<1xi32>, tensor<0xi32>) -> tensor -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[VAL_1:.*]] = "tf.Range"(%[[CST_1]], %[[VAL_0]], %[[CST_2]]) : (tensor, tensor, tensor) -> tensor // CHECK: return %[[VAL_1]] : tensor func.func @dynamic_iota_i32_1d(%arg0: tensor<1xi32>) -> tensor { @@ -5057,11 +5057,11 @@ func.func @dynamic_iota_i32_1d(%arg0: tensor<1xi32>) -> tensor { // CHECK-LABEL: func @dynamic_iota_f32_1d( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1xi32>) -> tensor { -// CHECK: %[[VAL_0:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi32>) -> tensor<1xf32> -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> +// CHECK: %[[VAL_0:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xf32> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK: %[[VAL_1:.*]] = "tf.Reshape"(%[[VAL_0]], %[[CST_0]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK: %[[VAL_2:.*]] = "tf.Range"(%[[CST_1]], %[[VAL_1]], %[[CST_2]]) : (tensor, tensor, tensor) -> tensor // CHECK: return %[[VAL_2]] : tensor func.func @dynamic_iota_f32_1d(%arg0: tensor<1xi32>) -> tensor { @@ -5073,8 +5073,8 @@ func.func @dynamic_iota_f32_1d(%arg0: tensor<1xi32>) -> tensor { // CHECK-SAME: %arg0: tensor<1x?x4x256xf32>, // CHECK-SAME: %arg1: tensor<4xi32>, // CHECK-SAME: %arg2: tensor<4xi32>) -> tensor<1x?x4x128xf32> { -// CHECK: %cst = "tf.Const"() {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %cst) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1x?x4x256xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x?x4x128xf32> +// CHECK: %cst = "tf.Const"() <{value = dense<1> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK: %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %cst) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<1x?x4x256xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x?x4x128xf32> // CHECK: return %0 : tensor<1x?x4x128xf32> func.func @real_dynamic_slice_strides_equal_to_1_signed(%arg0: tensor<1x?x4x256xf32>, %arg1: tensor<4xi32>, %arg2: tensor<4xi32>) -> tensor<1x?x4x128xf32> { %cst = mhlo.constant dense<1> : tensor<4xi32> @@ -5086,8 +5086,8 @@ func.return %0 : tensor<1x?x4x128xf32> // CHECK-SAME: %arg0: tensor<1x?x2x4xf32>, // CHECK-SAME: %arg1: tensor<4xi32>, 
// CHECK-SAME: %arg2: tensor<4xi32>) -> tensor<1x?x1x2xf32> { -// CHECK %cst = "tf.Const"() {value = dense<2> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %cst) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1x?x2x4xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x?x1x2xf32> +// CHECK %cst = "tf.Const"() <{value = dense<2> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %cst) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<1x?x2x4xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x?x1x2xf32> // CHECK return %0 : tensor<1x?x1x2xf32> func.func @real_dynamic_slice_strides_not_equal_to_1(%arg0: tensor<1x?x2x4xf32>, %arg1: tensor<4xi32>, %arg2: tensor<4xi32>) -> tensor<1x?x1x2xf32> { %cst = mhlo.constant dense<2> : tensor<4xi32> @@ -5113,7 +5113,7 @@ func.func @remove_shape_assertion_custom_call(%arg1: tensor) -> tensor< // CHECK-SAME: %[[ARG_1:.*]]: tensor<1x4xi32>, // CHECK-SAME: %[[ARG_2:.*]]: tensor, // CHECK-SAME: %[[ARG_3:.*]]: tensor) -> (tensor<1x4xf32>, tensor<1x4xi32>) { -// CHECK: %[[VALUES:.*]], %[[INDICES:.*]] = "tf.ApproxTopK"(%[[ARG_0]]) {aggregate_to_topk = true, is_max_k = true, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 1 : i64, reduction_input_size_override = -1 : i64} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) +// CHECK: %[[VALUES:.*]], %[[INDICES:.*]] = "tf.ApproxTopK"(%[[ARG_0]]) <{aggregate_to_topk = true, is_max_k = true, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 1 : i64, reduction_input_size_override = -1 : i64}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) // CHECK: return %[[VALUES]], %[[INDICES]] : tensor<1x4xf32>, tensor<1x4xi32> // CHECK: } func.func @convert_approx_top_k_custom_call(%arg0: tensor<1x4xf32>, %arg1: tensor<1x4xi32>, %arg2: tensor, %arg3: tensor) -> (tensor<1x4xf32>, tensor<1x4xi32>) { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir index e822963984cf62..4e12ffd931c5f2 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir @@ -1,9 +1,22 @@ -//RUN: tf_tfl_translate --enable-stablehlo-conversion --input-mlir %s -o /tmp/temp.stablehlo; [ -f /tmp/temp.stablehlo ] +//RUN: tf_tfl_translate --enable-stablehlo-conversion --input-mlir %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s module { -func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> { - %0 = "tf.Add"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> - func.return %0 : tensor<2xi32> +func.func @tfInplaceUpdate(%arg0: tensor<2x1x2xf32>) -> tensor<2x1x2xf32> { + %1 = arith.constant dense<1> : tensor<1xi32> + %2 = arith.constant dense<2.0> : tensor<1x1x2xf32> + %3 = "tf.InplaceUpdate"(%arg0, %1, %2) {device = ""} + : (tensor<2x1x2xf32>, tensor<1xi32>, tensor<1x1x2xf32>) -> tensor<2x1x2xf32> + func.return %3 : tensor<2x1x2xf32> } -} \ No newline at end of file +} + +//CHECK: module attributes {tfl.description = "MLIR Converted.", tfl.metadata = {keep_stablehlo_constant = "true"}, tfl.schema_version = 3 : i32} { 
+//CHECK-NEXT: func.func @main(%arg0: tensor<2x1x2xf32>) -> tensor<2x1x2xf32> attributes {tf.entry_function = {inputs = "arg0", outputs = "stablehlo.dynamic_update_slice"}} { +//CHECK-DAG: %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x2xf32> +//CHECK-DAG: %1 = stablehlo.constant dense<1> : tensor +//CHECK-DAG: %2 = stablehlo.constant dense<0> : tensor +//CHECK-NEXT: %3 = stablehlo.dynamic_update_slice %arg0, %0, %1, %2, %2 : (tensor<2x1x2xf32>, tensor<1x1x2xf32>, tensor, tensor, tensor) -> tensor<2x1x2xf32> +//CHECK-NEXT: return %3 : tensor<2x1x2xf32> +//CHECK-NEXT: } +//CHECK-NEXT:} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir index bd2f56dd930bb5..073f31e39786d9 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir @@ -132,13 +132,13 @@ func.func @batchNormInference_f16_overflow( func.func @batchNormTraining_4D_middle_features( %x: tensor<3x4x256x6xf32>, %scale: tensor<256xf32>, %offset: tensor<256xf32>) -> (tensor<3x4x256x6xf32>) { - // CHECK-DAG: %[[CST_AXIS:.+]] = "tf.Const"() {value = dense<[0, 1, 3]> : tensor<3xi32>} : () -> tensor<3xi32> + // CHECK-DAG: %[[CST_AXIS:.+]] = "tf.Const"() <{value = dense<[0, 1, 3]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK-DAG: %[[X_SHAPE:.+]] = shape.const_shape [3, 4, 256, 6] : tensor<4xindex> // CHECK-DAG: %[[EPS:.+]] = mhlo.constant dense<1.000000e+00> : tensor<256xf32> - // CHECK-DAG: %[[MEAN:.+]] = "tf.Mean"(%arg0, %[[CST_AXIS]]) {keep_dims = false} : (tensor<3x4x256x6xf32>, tensor<3xi32>) -> tensor<256xf32> + // CHECK-DAG: %[[MEAN:.+]] = "tf.Mean"(%arg0, %[[CST_AXIS]]) <{keep_dims = false}> : (tensor<3x4x256x6xf32>, tensor<3xi32>) -> tensor<256xf32> // CHECK-DAG: %[[MEAN_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[X_SHAPE]]) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xf32>, tensor<4xindex>) -> tensor<3x4x256x6xf32> // CHECK-DAG: %[[SQ_DIFF:.+]] = "tf.SquaredDifference"(%arg0, %[[MEAN_BCAST]]) : (tensor<3x4x256x6xf32>, tensor<3x4x256x6xf32>) -> tensor<3x4x256x6xf32> - // CHECK-DAG: %[[VARIANCE:.+]] = "tf.Mean"(%[[SQ_DIFF]], %[[CST_AXIS]]) {keep_dims = false} : (tensor<3x4x256x6xf32>, tensor<3xi32>) -> tensor<256xf32> + // CHECK-DAG: %[[VARIANCE:.+]] = "tf.Mean"(%[[SQ_DIFF]], %[[CST_AXIS]]) <{keep_dims = false}> : (tensor<3x4x256x6xf32>, tensor<3xi32>) -> tensor<256xf32> // CHECK-DAG: %[[VARIANCE_EPS:.+]] = mhlo.add %[[VARIANCE]], %[[EPS]] : tensor<256xf32> // CHECK-DAG: %[[VARIANCE_EPS_RSQRT:.+]] = mhlo.rsqrt %[[VARIANCE_EPS]] : tensor<256xf32> // CHECK-DAG: %[[MULTIPLIER:.+]] = mhlo.multiply %[[VARIANCE_EPS_RSQRT]], %[[SCALE]] : tensor<256xf32> @@ -152,4 +152,4 @@ func.func @batchNormTraining_4D_middle_features( {epsilon = 1.0 : f32, feature_index = 2 : i64} : (tensor<3x4x256x6xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<3x4x256x6xf32>, tensor<256xf32>, tensor<256xf32>) func.return %0 : tensor<3x4x256x6xf32> -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc index 562afd53f6f76c..df3a5f62e8ff59 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/drop_savedmodel_semantics.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.h" @@ -34,6 +35,9 @@ namespace odml { void AddTFToStablehloPasses(OpPassManager& pm, bool skip_resize, bool smuggle_disallowed_ops) { pm.addPass(CreateRenameEntrypointToMainPass()); + + // if the input is a call_xla_module, then unwrap the content + pm.addPass(mlir::odml::CreateLegalizeTFXlaCallModuleToStablehloPass()); // TODO(b/230572023): Consider improving shape inference for While op instead // of dropping the attribute. This need not be correct for models not trained // on TPU. diff --git a/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir b/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir index ccb5507a8ee5c5..378ed7fb2a46e1 100644 --- a/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir +++ b/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir @@ -10,7 +10,7 @@ func.func @testDilatedConv(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor<5x5x3x8 // CHECK-LABEL: testDilatedConv // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>) - // CHECK-NEXT: [[RESULT:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x120x120x8xf32> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x120x120x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x120x120x8xf32> } @@ -24,7 +24,7 @@ func.func @testDilatedConvWithNonConstantPadAndCrops(%arg0: tensor<1x128x128x3xf // CHECK-LABEL: testDilatedConvWithNonConstantPadAndCrops // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>) - // CHECK-NEXT: [[RESULT:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x120x120x8xf32> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x120x120x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x120x120x8xf32> } @@ -39,7 +39,7 @@ func.func @testDilatedConvWithNonZeroBasePadding(%arg0: tensor<1x128x128x3xf32>, // CHECK-LABEL: testDilatedConvWithNonZeroBasePadding // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>) - // CHECK-NEXT: [[RESULT:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } @@ -54,7 +54,7 @@ 
func.func @testDilatedConvWithFp16(%arg0 : tensor<1x20x30x40xf16>, %arg1: tensor // CHECK-LABEL: testDilatedConvWithFp16 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x20x30x40xf16>, [[FILTER:%.*]]: tensor<5x5x40x32xf16>) - // CHECK-NEXT: [[RESULT:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {data_format = "NHWC", dilations = [1, 2, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x20x30x40xf16>, tensor<5x5x40x32xf16>) -> tensor<1x20x30x32xf16> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) <{data_format = "NHWC", dilations = [1, 2, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x20x30x40xf16>, tensor<5x5x40x32xf16>) -> tensor<1x20x30x32xf16> // CHECK-NEXT: return [[RESULT]] : tensor<1x20x30x32xf16> } @@ -85,7 +85,7 @@ func.func @testDilatedDepthWiseConv(%arg0: tensor<1x128x128x3xf32>, %arg1: tenso // CHECK-LABEL: testDilatedDepthWiseConv // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>) - // CHECK-NEXT: [[RESULT:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } @@ -103,7 +103,7 @@ func.func @testDilatedConvWithPad(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor< // CHECK-LABEL: testDilatedConvWithPad // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[CONV]], [[BIAS]]) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } @@ -122,7 +122,7 @@ func.func @testDilatedDepthWiseConvWithPad(%arg0: tensor<1x128x128x3xf32>, %arg1 // CHECK-LABEL: testDilatedDepthWiseConvWithPad // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[CONV]], [[BIAS]]) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } @@ -139,7 +139,7 @@ func.func @testDilatedConvWithBiasAdd(%arg0: tensor<1x128x128x3xf32>, %arg1: ten // CHECK-LABEL: testDilatedConvWithBiasAdd // CHECK-SAME: ([[INPUT:%.*]]: 
tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[CONV]], [[BIAS]]) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } @@ -156,7 +156,7 @@ func.func @testDilatedDepthWiseConvWithBiasAdd(%arg0: tensor<1x128x128x3xf32>, % // CHECK-LABEL: testDilatedDepthWiseConvWithBiasAdd // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[CONV]], [[BIAS]]) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } @@ -176,10 +176,10 @@ func.func @testDilatedConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, %arg1 // CHECK-LABEL: testDilatedConvWithExpandSqueeze1 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [3]}> : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> } @@ -199,10 +199,10 @@ func.func @testDilatedDepthWiseConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf3 // CHECK-LABEL: testDilatedDepthWiseConvWithExpandSqueeze1 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<3> : 
tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [3]}> : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> } @@ -222,10 +222,10 @@ func.func @testDilatedConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, %arg1 // CHECK-LABEL: testDilatedConvWithExpandSqueeze2 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [3]}> : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> } @@ -245,10 +245,10 @@ func.func @testDilatedDepthWiseConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf3 // CHECK-LABEL: testDilatedDepthWiseConvWithExpandSqueeze2 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) 
<{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [3]}> : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> } @@ -270,10 +270,10 @@ func.func @testDilatedConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, %arg1 // CHECK-LABEL: testDilatedConvWithExpandSqueeze3 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [3]}> : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> } @@ -295,10 +295,10 @@ func.func @testDilatedDepthWiseConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf3 // CHECK-LABEL: testDilatedDepthWiseConvWithExpandSqueeze3 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [3]}> : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> } @@ -407,10 +407,10 @@ func.func @testDilatedConv1DExpandH(%arg0: tensor<1x128x3xf32>, %arg1: tensor<1x 
// CHECK-LABEL: testDilatedConv1DExpandH // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x3xf32>, [[FILTER:%.*]]: tensor<1x5x3x8xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<-3> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<-3> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x3xf32>, tensor) -> tensor<1x1x128x3xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 1, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x1x128x3xf32>, tensor<1x5x3x8xf32>) -> tensor<1x1x128x8xf32> - // CHECK-NEXT: [[RESULT:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [-3]} : (tensor<1x1x128x8xf32>) -> tensor<1x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) <{dilations = [1, 1, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x1x128x3xf32>, tensor<1x5x3x8xf32>) -> tensor<1x1x128x8xf32> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [-3]}> : (tensor<1x1x128x8xf32>) -> tensor<1x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x8xf32> } @@ -429,10 +429,10 @@ func.func @testDilatedConv1DExpandHWithBiasAdd(%arg0: tensor<1x128x3xf32>, %arg1 // CHECK-LABEL: testDilatedConv1DExpandHWithBiasAdd // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x3xf32>, [[FILTER:%.*]]: tensor<1x5x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<-3> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<-3> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x3xf32>, tensor) -> tensor<1x1x128x3xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 1, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x1x128x3xf32>, tensor<1x5x3x8xf32>) -> tensor<1x1x128x8xf32> - // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [-3]} : (tensor<1x1x128x8xf32>) -> tensor<1x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) <{dilations = [1, 1, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x1x128x3xf32>, tensor<1x5x3x8xf32>) -> tensor<1x1x128x8xf32> + // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [-3]}> : (tensor<1x1x128x8xf32>) -> tensor<1x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x8xf32>, tensor<8xf32>) -> tensor<1x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x8xf32> } @@ -451,10 +451,10 @@ func.func @testDilatedConv1DExpandW(%arg0: tensor<1x128x3xf32>, %arg1: tensor<5x // CHECK-LABEL: testDilatedConv1DExpandW // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x3xf32>, [[FILTER:%.*]]: tensor<5x1x3x8xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<-2> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<-2> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x3xf32>, tensor) -> tensor<1x128x1x3xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x1x3xf32>, tensor<5x1x3x8xf32>) -> tensor<1x128x1x8xf32> - // CHECK-NEXT: [[RESULT:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [-2]} : (tensor<1x128x1x8xf32>) -> tensor<1x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], 
[[FILTER]]) <{dilations = [1, 2, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x1x3xf32>, tensor<5x1x3x8xf32>) -> tensor<1x128x1x8xf32> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [-2]}> : (tensor<1x128x1x8xf32>) -> tensor<1x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x8xf32> } @@ -473,10 +473,10 @@ func.func @testDilatedConv1DExpandWWithBiasAdd(%arg0: tensor<1x128x3xf32>, %arg1 // CHECK-LABEL: testDilatedConv1DExpandWWithBiasAdd // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x3xf32>, [[FILTER:%.*]]: tensor<5x1x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<-2> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<-2> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x3xf32>, tensor) -> tensor<1x128x1x3xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x1x3xf32>, tensor<5x1x3x8xf32>) -> tensor<1x128x1x8xf32> - // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [-2]} : (tensor<1x128x1x8xf32>) -> tensor<1x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) <{dilations = [1, 2, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x128x1x3xf32>, tensor<5x1x3x8xf32>) -> tensor<1x128x1x8xf32> + // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [-2]}> : (tensor<1x128x1x8xf32>) -> tensor<1x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x8xf32>, tensor<8xf32>) -> tensor<1x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x8xf32> } @@ -495,10 +495,10 @@ func.func @testDilatedConv1DWithMixedPostiveAndNegativeAxis(%arg0: tensor<1x128x // CHECK-LABEL: testDilatedConv1DWithMixedPostiveAndNegativeAxis // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x3xf32>, [[FILTER:%.*]]: tensor<1x5x3x8xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x3xf32>, tensor) -> tensor<1x1x128x3xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 1, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x1x128x3xf32>, tensor<1x5x3x8xf32>) -> tensor<1x1x128x8xf32> - // CHECK-NEXT: [[RESULT:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [-3]} : (tensor<1x1x128x8xf32>) -> tensor<1x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) <{dilations = [1, 1, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x1x128x3xf32>, tensor<1x5x3x8xf32>) -> tensor<1x1x128x8xf32> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [-3]}> : (tensor<1x1x128x8xf32>) -> tensor<1x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x8xf32> } @@ -518,11 +518,11 @@ func.func @testPaddedDilatedConv(%arg0 : tensor<2x1920x64xf32>) -> tensor<2x192 // CHECK-LABEL: testPaddedDilatedConv // CHECK-SAME: ([[INPUT:%.*]]: tensor<2x1920x64xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<2> : tensor} : () -> tensor - // CHECK-NEXT: [[FILTER:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x1x64x128xf32>} : () -> tensor<3x1x64x128xf32> + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() <{value = 
dense<2> : tensor}> : () -> tensor + // CHECK-NEXT: [[FILTER:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3x1x64x128xf32>}> : () -> tensor<3x1x64x128xf32> // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) {device = ""} : (tensor<2x1920x64xf32>, tensor) -> tensor<2x1920x1x64xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {data_format = "NHWC", device = "", dilations = [1, 2, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<2x1920x1x64xf32>, tensor<3x1x64x128xf32>) -> tensor<2x1920x1x128xf32> - // CHECK-NEXT: [[RESULT:%.*]] = "tf.Squeeze"([[CONV]]) {device = "", squeeze_dims = [2]} : (tensor<2x1920x1x128xf32>) -> tensor<2x1920x128xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) <{data_format = "NHWC", dilations = [1, 2, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> {device = ""} : (tensor<2x1920x1x64xf32>, tensor<3x1x64x128xf32>) -> tensor<2x1920x1x128xf32> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.Squeeze"([[CONV]]) <{squeeze_dims = [2]}> {device = ""} : (tensor<2x1920x1x128xf32>) -> tensor<2x1920x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<2x1920x128xf32> } @@ -539,7 +539,7 @@ func.func @testDilatedConvInterleaved(%arg0: tensor<1x128x128x3xf32>, %arg1: ten // CHECK-LABEL: testDilatedConvInterleaved // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>) - // CHECK-NEXT: [[RESULT0:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x120x120x8xf32> - // CHECK-NEXT: [[RESULT1:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x120x120x8xf32> + // CHECK-NEXT: [[RESULT0:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x120x120x8xf32> + // CHECK-NEXT: [[RESULT1:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) <{dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]}> : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x120x120x8xf32> // CHECK-NEXT: return [[RESULT0]], [[RESULT1]] : tensor<1x120x120x8xf32>, tensor<1x120x120x8xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/if_op.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/if_op.mlir index f29afb30846142..7ea7e48777522e 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/if_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/if_op.mlir @@ -1,7 +1,7 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s // Confirm function references in if ops are preserved func.func @main(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { -// CHECK: %{{.*}} = "tf.If"(%{{.*}}, %{{.*}}, %{{.*}}) {else_branch = @cond_false, is_stateless = false, then_branch = @cond_true} : (tensor<1xi1>, tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> +// CHECK: %{{.*}} = "tf.If"(%{{.*}}, %{{.*}}, %{{.*}}) <{else_branch = @cond_false, is_stateless = false, then_branch = @cond_true}> : (tensor<1xi1>, tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> %0 = "tfl.less"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> 
tensor<1xi1> %1 = "tf.If"(%0, %arg0, %arg1) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = false} : (tensor<1xi1>, tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> func.return %1 : tensor<1xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/metadata_buffer.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/metadata_buffer.mlir new file mode 100644 index 00000000000000..6b76b31c9a52bf --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/metadata_buffer.mlir @@ -0,0 +1,9 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s + +// CHECK: tfl.metadata_buffer = [3 : i32, 7 : i32] +module attributes {tfl.metadata_buffer = [3 : i32, 7 : i32]} { + func.func @main(%arg0: tensor, %arg1: tensor<3x2xi32>) -> tensor<3x2xi32> { + %0 = "tfl.add" (%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor, tensor<3x2xi32>) -> tensor<3x2xi32> + func.return %0 : tensor<3x2xi32> + } +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir index a807cc84ee0a78..ab9b39bd94cb97 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir @@ -66,7 +66,7 @@ func.func @while_cond_10_frozen0(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %ar // CANON: }, { // CANON: ^bb0([[VAL_11:%.*]]: tensor<*xi32>, [[VAL_12:%.*]]: tensor<*xi32>, [[VAL_13:%.*]]: tensor<*xf32>): // CANON-DAG: [[VAL_4:%.*]] = arith.constant dense<1> : tensor -// CANON-DAG: [[VAL_5:%.*]] = "tf.Const"() {value = dense<2.560000e+02> : tensor<256x256xf32>} : () -> tensor +// CANON-DAG: [[VAL_5:%.*]] = "tf.Const"() <{value = dense<2.560000e+02> : tensor<256x256xf32>}> : () -> tensor // CANON: [[VAL_14:%.*]] = "tf.AddV2"([[VAL_12]], [[VAL_4]]) // CANON: [[VAL_15:%.*]] = "tf.AddV2"([[VAL_13]], [[VAL_5]]) // CANON: [[VAL_16:%.*]] = "tf.AddV2"([[VAL_11]], [[VAL_4]]) diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 444a494f73769a..685efd5be0ca2d 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -198,7 +198,7 @@ func.func @fakeQuantVarsTrue(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor, %ar func.return %0 : tensor<8x8x8x8xf32> // CHECK-LABEL: fakeQuantVarsTrue - // CHECK: "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {max = 1.000000e+00 : f32, min = 0.000000e+00 : f32, narrow_range = true, num_bits = 5 : i64} + // CHECK: "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) <{narrow_range = true, num_bits = 5 : i64}> {max = 1.000000e+00 : f32, min = 0.000000e+00 : f32} } func.func @fakeQuantArgsFalse4Bits(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> { @@ -235,7 +235,7 @@ func.func @fakeQuantVarsTrue4Bits(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor func.return %0 : tensor<8x8x8x8xf32> // CHECK-LABEL: fakeQuantVarsTrue - // CHECK: "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {max = 1.000000e+00 : f32, min = 0.000000e+00 : f32, narrow_range = true, num_bits = 3 : i64} + // CHECK: "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) <{narrow_range = true, num_bits = 3 : i64}> {max = 1.000000e+00 : f32, min = 0.000000e+00 : f32} } func.func @const() -> tensor<2xi32> { @@ -1421,7 +1421,7 @@ func.func @strided_slice_big_dims(%arg0: tensor<5x6x7xf32>, %arg1: tensor<3xi32> %0 = 
"tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 7 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<5x6x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x1x5x6x7xf32> func.return %0 : tensor<1x1x5x6x7xf32> // CHECK-LABEL: strided_slice_big_dims - // CHECK: %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 7 : i64, offset = false, shrink_axis_mask = 0 : i64} : (tensor<5x6x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x1x5x6x7xf32> + // CHECK: %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 7 : i64, shrink_axis_mask = 0 : i64}> {offset = false} : (tensor<5x6x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x1x5x6x7xf32> } func.func @slice1Tensor(%arg0: tensor<2x3x5xf32>, %arg1: tensor<3xi32>, %arg2: tensor<3xi32>) -> tensor { @@ -1606,7 +1606,7 @@ func.func @sparse_to_dense_with_2d_sparse_indices_and_second_dim_greater_than_4( %0 = "tf.SparseToDense"(%arg0, %arg1, %arg2, %arg3) {validate_indices = true}: (tensor<3x5xi32>, tensor<3xi32>, tensor<2xf32>, tensor) -> tensor func.return %0 : tensor // CHECK-LABEL: sparse_to_dense_with_2d_sparse_indices_and_second_dim_greater_than_4 - // CHECK: "tf.SparseToDense"(%arg0, %arg1, %arg2, %arg3) {validate_indices = true} : (tensor<3x5xi32>, tensor<3xi32>, tensor<2xf32>, tensor) -> tensor + // CHECK: "tf.SparseToDense"(%arg0, %arg1, %arg2, %arg3) <{validate_indices = true}> : (tensor<3x5xi32>, tensor<3xi32>, tensor<2xf32>, tensor) -> tensor } func.func @where(%arg0: tensor<3x5xi1>) -> tensor { @@ -2311,7 +2311,7 @@ func.func @conv3d_invalid_strides(%arg0: tensor,%arg1: tensor, tensor) -> tensor func.return %0: tensor // CHECK-LABEL: conv3d_invalid_strides - // CHECK: [[BCT:%.*]] = "tf.Conv3D"(%arg0, %arg1) {padding = "SAME", strides = [2, 1, 1, 1, 1]} : (tensor, tensor) -> tensor + // CHECK: [[BCT:%.*]] = "tf.Conv3D"(%arg0, %arg1) <{padding = "SAME", strides = [2, 1, 1, 1, 1]}> : (tensor, tensor) -> tensor // CHECK: return [[BCT]] : tensor } @@ -2705,7 +2705,7 @@ func.func @approx_top_k_with_min_k(%arg0: tensor<1x4xf32>) -> (tensor<1x4xf32>, func.return %values, %indices: tensor<1x4xf32>, tensor<1x4xi32> // CHECK-LABEL: approx_top_k_with_min_k - // CHECK: %values, %indices = "tf.ApproxTopK"(%arg0) {aggregate_to_topk = true, is_max_k = false, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 1 : i64, reduction_input_size_override = -1 : i64} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) + // CHECK: %values, %indices = "tf.ApproxTopK"(%arg0) <{aggregate_to_topk = true, is_max_k = false, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 1 : i64, reduction_input_size_override = -1 : i64}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) // CHECK: return %values, %indices : tensor<1x4xf32>, tensor<1x4xi32> } @@ -2714,7 +2714,7 @@ func.func @approx_top_k_reduction_dimension_not_last_dim(%arg0: tensor<1x4xf32>) func.return %values, %indices: tensor<1x4xf32>, tensor<1x4xi32> // CHECK-LABEL: approx_top_k_reduction_dimension_not_last_dim - // CHECK: %values, %indices = "tf.ApproxTopK"(%arg0) {aggregate_to_topk = true, is_max_k = true, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 0 : i64, reduction_input_size_override = -1 : i64} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, 
tensor<1x4xi32>) + // CHECK: %values, %indices = "tf.ApproxTopK"(%arg0) <{aggregate_to_topk = true, is_max_k = true, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 0 : i64, reduction_input_size_override = -1 : i64}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) // CHECK: return %values, %indices : tensor<1x4xf32>, tensor<1x4xi32> } diff --git a/tensorflow/compiler/mlir/lite/tests/lift_tflite_flex_ops.mlir b/tensorflow/compiler/mlir/lite/tests/lift_tflite_flex_ops.mlir index a03519fbdd1d6f..8ed0fe8cac86cf 100644 --- a/tensorflow/compiler/mlir/lite/tests/lift_tflite_flex_ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/lift_tflite_flex_ops.mlir @@ -20,7 +20,7 @@ func.func @TfBatchMatMulV2(%arg0: tensor<4x128x2xf32>, %arg1: tensor<2x1xf32>) custom_option = #tfl } : (tensor<4x128x2xf32>, tensor<2x1xf32>) -> tensor<4x128x1xf32> -// CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false} : (tensor<4x128x2xf32>, tensor<2x1xf32>) -> tensor<4x128x1xf32> +// CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> {T = f32} : (tensor<4x128x2xf32>, tensor<2x1xf32>) -> tensor<4x128x1xf32> func.return %0 : tensor<4x128x1xf32> } @@ -66,7 +66,7 @@ func.func @TfMapDataset(%arg0: tensor) -> (tensor // CHECK: "tf.MapDataset"( -// CHECK-SAME: {Targuments = [], f = @{{.*}}, metadata = "", output_shapes = [#tf_type.shape<>], output_types = [!tf_type.string], preserve_cardinality = true, use_inter_op_parallelism = true} +// CHECK-SAME: <{f = @{{.*}}, metadata = "", output_shapes = [#tf_type.shape<>], output_types = [!tf_type.string], preserve_cardinality = true, use_inter_op_parallelism = true}> {Targuments = []} } // CHECK-LABEL: TfTakeWhileDataset @@ -78,7 +78,7 @@ func.func @TfTakeWhileDataset(%arg0: tensor, %arg1: tensor // CHECK: "tf.TakeWhileDataset"( -// CHECK-SAME: {Targuments = [!tf_type.resource, !tf_type.resource, i64, !tf_type.resource, !tf_type.resource, !tf_type.resource, !tf_type.resource, i64], metadata = "", output_shapes = [#tf_type.shape<>], output_types = [!tf_type.string], predicate = @{{.*}}} +// CHECK-SAME: <{metadata = "", output_shapes = [#tf_type.shape<>], output_types = [!tf_type.string], predicate = @{{.*}}}> {Targuments = [!tf_type.resource, !tf_type.resource, i64, !tf_type.resource, !tf_type.resource, !tf_type.resource, !tf_type.resource, i64]} } // CHECK-LABEL: FailureOnInvalidOp diff --git a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir index 3bfeceee049626..79d969106d868a 100644 --- a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir +++ b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir @@ -4,9 +4,9 @@ // CHECK-LABEL: tensorlistConst func.func @tensorlistConst(%arg0 : tensor<1xi32>) -> tensor<2x3xi32> { - // CHECK-DAG: %[[ELEMENT0:.*]] = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK-DAG: %[[ELEMENT1:.*]] = "tf.Const"() {value = dense<[3, 4, 5]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK: %[[LIST:.*]] = "tf.Pack"(%[[ELEMENT0]], %[[ELEMENT1]]) {axis = 0 : i64} : (tensor<3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + // CHECK-DAG: %[[ELEMENT0:.*]] = "tf.Const"() <{value = dense<[0, 1, 2]> : tensor<3xi32>}> : () -> tensor<3xi32> + // CHECK-DAG: %[[ELEMENT1:.*]] = "tf.Const"() <{value = dense<[3, 4, 5]> : tensor<3xi32>}> : () -> tensor<3xi32> + // CHECK: %[[LIST:.*]] = "tf.Pack"(%[[ELEMENT0]], %[[ELEMENT1]]) <{axis = 0 : i64}> : 
(tensor<3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %0 = "tf.Const"() {value = #tf_type : tensor} : () -> tensor>> // CHECK: return %[[LIST]] @@ -20,7 +20,7 @@ func.func @tensorlistConst(%arg0 : tensor<1xi32>) -> tensor<2x3xi32> { func.func @emptyTensorlistConst(%arg0 : tensor<1xi32>) -> tensor<0x3xi32> { %0 = "tf.Const"() {value = #tf_type : tensor} : () -> tensor>> - // CHECK: "tf.Const"() {value = dense<> : tensor<0x3xi32>} : () -> tensor<0x3xi32> + // CHECK: "tf.Const"() <{value = dense<> : tensor<0x3xi32>}> : () -> tensor<0x3xi32> // CHECK-NOT: tf.TensorListStack %1 = "tf.TensorListStack"(%0, %arg0) : (tensor>>, tensor<1xi32>) -> tensor<0x3xi32> func.return %1 : tensor<0x3xi32> @@ -35,7 +35,7 @@ func.func @tensorlistGetItem(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg %2 = "tf.TensorListStack"(%0, %arg1) : (tensor>>, tensor<1xi32>) -> tensor<3x10xf32> func.return %1, %2 : tensor<10xf32>, tensor<3x10xf32> -// CHECK: %0 = "tf.Gather"(%arg0, %arg2) {validate_indices = true} : (tensor<3x10xf32>, tensor) -> tensor<10xf32> +// CHECK: %0 = "tf.Gather"(%arg0, %arg2) <{validate_indices = true}> : (tensor<3x10xf32>, tensor) -> tensor<10xf32> // CHECK: return %0, %arg0 : tensor<10xf32>, tensor<3x10xf32> } @@ -48,7 +48,7 @@ func.func @tensorlistGetItemWithUnknownRank(%arg0: tensor<*xf32>, %arg1: tensor< %2 = "tf.TensorListStack"(%0, %arg1) : (tensor>>, tensor<1xi32>) -> tensor<*xf32> func.return %1, %2 : tensor<*xf32>, tensor<*xf32> -// CHECK: %0 = "tf.Gather"(%arg0, %arg2) {validate_indices = true} : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: %0 = "tf.Gather"(%arg0, %arg2) <{validate_indices = true}> : (tensor<*xf32>, tensor) -> tensor<*xf32> // CHECK: return %0, %arg0 : tensor<*xf32>, tensor<*xf32> } @@ -175,7 +175,7 @@ func.func @tensorlistReserve(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: te // CHECK-DAG: [[SHAPE:%.*]] = "tf.Concat"([[ZERO2]], [[DIM0]], %arg0) : (tensor, tensor<1xi32>, tensor<3xi32>) -> tensor<4xi32> // CHECK-DAG: [[VALUES:%.*]] = arith.constant dense<0.000000e+00> : tensor // CHECK: [[LIST:%.*]] = "tf.Fill"([[SHAPE]], [[VALUES]]) : (tensor<4xi32>, tensor) -> tensor -// CHECK: [[RESULT:%.*]] = "tf.Gather"([[LIST]], %arg2) {validate_indices = true} : (tensor, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.Gather"([[LIST]], %arg2) <{validate_indices = true}> : (tensor, tensor) -> tensor // CHECK: return [[RESULT]] : tensor } @@ -188,7 +188,7 @@ func.func @tensorlistReserveUnrankedElements(%arg0: tensor, %arg1: tensor func.return %1 : tensor<*xf32> // CHECK: [[RESULT:%[0-9]+]] = "tf.Fill"{{.*}}(tensor, tensor) -> tensor<*xf32> -// CHECK: [[RESULT2:%[0-9]+]] = "tf.Gather"{{.*}}{validate_indices = true} : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: [[RESULT2:%[0-9]+]] = "tf.Gather"{{.*}}<{validate_indices = true}> : (tensor<*xf32>, tensor) -> tensor<*xf32> // CHECK: return [[RESULT2]] : tensor<*xf32> } @@ -208,7 +208,7 @@ func.func @tensorlistReserveConstantUnknownElementShapeDim(%arg0: tensor, % // CHECK-DAG: [[SHAPE:%.*]] = "tf.Concat"([[ZERO2]], [[DIM0]], [[ELEMENT_SHAPE]]) : (tensor, tensor<1xi32>, tensor<2xi32>) -> tensor<3xi32> // CHECK-DAG: [[VALUES:%.*]] = arith.constant dense<0.000000e+00> : tensor // CHECK: [[LIST:%.*]] = "tf.Fill"([[SHAPE]], [[VALUES]]) : (tensor<3xi32>, tensor) -> tensor -// CHECK: [[RESULT:%.*]] = "tf.Gather"([[LIST]], %arg1) {validate_indices = true} : (tensor, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.Gather"([[LIST]], %arg1) <{validate_indices = true}> : (tensor, tensor) -> tensor // CHECK: return [[RESULT]] : 
tensor } @@ -245,7 +245,7 @@ func.func @tensorlistReserveUnrankedElementShape(%arg0: tensor<*xi32>, %arg1: te // CHECK-DAG: [[CONCAT:%.*]] = "tf.Concat"([[AXIS_1]], [[EXPAND_DIM]], %arg0) : (tensor, tensor<1xi32>, tensor<*xi32>) -> tensor // CHECK: [[CST:%.*]] = arith.constant dense<0.000000e+00> : tensor // CHECK: [[FILL:%.*]] = "tf.Fill"([[CONCAT]], [[CST]]) : (tensor, tensor) -> tensor<*xf32> -// CHECK: [[GATHER:%.*]] = "tf.Gather"([[FILL]], %arg2) {validate_indices = true} : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: [[GATHER:%.*]] = "tf.Gather"([[FILL]], %arg2) <{validate_indices = true}> : (tensor<*xf32>, tensor) -> tensor<*xf32> // CHECK: return [[GATHER]] : tensor<*xf32> } @@ -263,7 +263,7 @@ func.func @EmptyTensorList(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tens // CHECK-DAG: [[SHAPE:%.*]] = "tf.Concat"([[ZERO]], [[DIM0]], [[ELEM_SHAPE]]) : (tensor, tensor<1xi32>, tensor<3xi32>) -> tensor<4xi32> // CHECK-DAG: [[VALUES:%.*]] = arith.constant dense<0.000000e+00> : tensor // CHECK: [[LIST:%.*]] = "tf.Fill"([[SHAPE]], [[VALUES]]) : (tensor<4xi32>, tensor) -> tensor<0x?x?x?xf32> -// CHECK: [[RESULT:%.*]] = "tf.Gather"([[LIST]], [[IDX]]) {validate_indices = true} : (tensor<0x?x?x?xf32>, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.Gather"([[LIST]], [[IDX]]) <{validate_indices = true}> : (tensor<0x?x?x?xf32>, tensor) -> tensor // CHECK: return [[RESULT]] : tensor } @@ -294,7 +294,7 @@ func.func @tensorlistLength(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>) -> (t // CHECK-SAME: ([[INPUT:%.*]]: tensor<3x10xf32>, [[ELEM_SHAPE:%.*]]: tensor<1xi32>) // CHECK-DAG: [[SHAPE:%.*]] = "tf.Shape"([[INPUT]]) {{.*}} -> tensor<2xi32> // CHECK-DAG: [[ZERO:%cst.*]] = arith.constant dense<0> : tensor -// CHECK: [[RESULT:%.*]] = "tf.Gather"([[SHAPE]], [[ZERO]]) {validate_indices = true} : (tensor<2xi32>, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.Gather"([[SHAPE]], [[ZERO]]) <{validate_indices = true}> : (tensor<2xi32>, tensor) -> tensor // CHECK: return [[RESULT]] : tensor } @@ -352,7 +352,8 @@ func.func @tensorlistWhileRegion(%arg0: tensor<2x3xf32>) -> tensor<*xf32> { %cst_1 = arith.constant dense<-1> : tensor %0 = "tf.TensorListFromTensor"(%arg0, %cst) : (tensor<2x3xf32>, tensor<1xi32>) -> tensor>> // CHECK: "tf.WhileRegion" - %1:2 = "tf.WhileRegion"(%cst_0, %0) ({ + // CHECK: <{is_stateless = false}> + %1:2 = "tf.WhileRegion"(%cst_0, %0) <{is_stateless = false}> ({ ^bb0(%carg0: tensor, %carg1: tensor): %cst_2 = arith.constant dense<2> : tensor %1 = "tf.Less"(%carg0, %cst_2) : (tensor, tensor) -> tensor @@ -376,9 +377,9 @@ func.func @tensorlistWhileRegion(%arg0: tensor<2x3xf32>) -> tensor<*xf32> { // CHECK-NOT: tensor // CHECK: "tf.Yield"(%[[LEN]], %[[BARG1]]) : (tensor, tensor<*xf32>) -> () - }) {is_stateless = false} : (tensor, tensor>>) -> (tensor, tensor>>) + }) : (tensor, tensor>>) -> (tensor, tensor>>) // make sure the variant types in input/output have been updated - // CHECK: {is_stateless = false} : (tensor, tensor<2x3xf32>) -> (tensor, tensor<*xf32>) + // : (tensor, tensor<2x3xf32>) -> (tensor, tensor<*xf32>) %2 = "tf.TensorListStack"(%1#1, %cst_1) : (tensor>>, tensor) -> tensor<*xf32> // CHECK: return %0#1 : tensor<*xf32> func.return %2 : tensor<*xf32> @@ -443,11 +444,11 @@ func.func @tensorlistResize(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2 // CHECK: [[ZERO:%.*]] = arith.constant dense<0> : tensor // CHECK: [[SHAPE:%.*]] = "tf.Shape"([[INPUT]]) : (tensor<3x10xf32>) -> tensor<2xi32> // CHECK: [[ZERO_1:%.*]] = arith.constant dense<0> : tensor -// CHECK: 
[[INPUT_SIZE:%.*]] = "tf.Gather"([[SHAPE]], [[ZERO_1]]) {validate_indices = true} : (tensor<2xi32>, tensor) -> tensor +// CHECK: [[INPUT_SIZE:%.*]] = "tf.Gather"([[SHAPE]], [[ZERO_1]]) <{validate_indices = true}> : (tensor<2xi32>, tensor) -> tensor // CHECK: [[SIZE_DIFF:%.*]] = "tf.Sub"([[SIZE]], [[INPUT_SIZE]]) : (tensor, tensor) -> tensor // CHECK: [[DIFF_RES:%.*]] = "tf.Greater"([[SIZE_DIFF]], [[ZERO]]) : (tensor, tensor) -> tensor // CHECK: [[SHAPE_1:%.*]] = "tf.Shape"([[INPUT]]) : (tensor<3x10xf32>) -> tensor -// CHECK: [[RESULT:%.*]] = "tf.If"([[DIFF_RES]], [[INPUT]], [[SHAPE_1]], [[SIZE_DIFF]], [[SIZE]]) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true} : (tensor, tensor<3x10xf32>, tensor, tensor, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.If"([[DIFF_RES]], [[INPUT]], [[SHAPE_1]], [[SIZE_DIFF]], [[SIZE]]) <{else_branch = @cond_false, is_stateless = true, then_branch = @cond_true}> : (tensor, tensor<3x10xf32>, tensor, tensor, tensor) -> tensor // CHECK: return [[RESULT]] : tensor } @@ -510,7 +511,7 @@ func.func @tensorlistConcat(%arg0: tensor<3x2x2xf32>, %lead: tensor) -> (te func.return %t#0, %t#1 : tensor, tensor<0xi64> // CHECK: [[ELEMENT_SHAPE:%.*]] = arith.constant dense<2> : tensor<2xi32> -// CHECK: [[UNPACK:%.*]]:3 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<3x2x2xf32>) -> (tensor<2x2xf32>, tensor<2x2xf32>, tensor<2x2xf32>) +// CHECK: [[UNPACK:%.*]]:3 = "tf.Unpack"(%arg0) <{axis = 0 : i64}> : (tensor<3x2x2xf32>) -> (tensor<2x2xf32>, tensor<2x2xf32>, tensor<2x2xf32>) // CHECK: [[SCALAR_ZERO:%.*]] = arith.constant dense<0> : tensor // CHECK: [[CONCAT:%.*]] = "tf.Concat"([[SCALAR_ZERO]], [[UNPACK]]#0, [[UNPACK]]#1, [[UNPACK]]#2) : (tensor, tensor<2x2xf32>, tensor<2x2xf32>, tensor<2x2xf32>) -> tensor // CHECK: [[LENGTHS:%.*]] = arith.constant dense<0> : tensor<0xi64> @@ -567,7 +568,7 @@ func.func @tensorListIf(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: te // CHECK: func @tensorListIf // CHECK-NEXT: %cst = arith.constant dense<2> : tensor // CHECK-NEXT: %0 = "tf.Less"(%arg2, %cst) : (tensor, tensor) -> tensor -// CHECK-NEXT: %1 = "tf.If"(%0, %arg0) {else_branch = @tensorListIfCondFalse, is_stateless = true, then_branch = @tensorListIfCondTrue} : (tensor, tensor<3x10xf32>) -> tensor<3x10xf32> +// CHECK-NEXT: %1 = "tf.If"(%0, %arg0) <{else_branch = @tensorListIfCondFalse, is_stateless = true, then_branch = @tensorListIfCondTrue}> : (tensor, tensor<3x10xf32>) -> tensor<3x10xf32> // CHECK-NEXT: return %1 : tensor<3x10xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata_buffer.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata_buffer.mlir new file mode 100644 index 00000000000000..f53f3954f14211 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata_buffer.mlir @@ -0,0 +1,11 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +module attributes {tfl.metadata_buffer = [3 : i32, 7 : i32]} { + func.func @main(%arg0: tensor, %arg1: tensor<3x2xi32>) -> tensor<3x2xi32> { + %0 = "tfl.add" (%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor, tensor<3x2xi32>) -> tensor<3x2xi32> + func.return %0 : tensor<3x2xi32> + } +} + +// CHECK: metadata_buffer: [ 3, 7 ], +// CHECK-NEXT: metadata: \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir index 7aac8662a8373a..27d98c7599c93d 100644 
--- a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir @@ -194,24 +194,24 @@ func.func @inference_standard_lstm_time_major(%arg0: tensor, %arg1: t // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> -// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) -// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor // CHECK-DAG: [[VAL_21:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_22:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<1> : tensor<3xi32> -// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 6 : i64, 
ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) <{begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_25:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } } @@ -240,32 +240,32 @@ func.func @inference_standard_indy_lstm_time_major(%arg0: tensor<8x8x8xf32>, %ar // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x4xf32>, tensor<2xi32>) -> tensor<4x10xf32> -// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<1> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<4x10xf32>, tensor<4xi32>, tensor) -> (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -// CHECK-DAG: [[VAL_20:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_20:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_21:%.*]] = "tf.Reshape"([[VAL_15]]#0, [[VAL_20]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_22:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_22:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_23:%.*]] = "tf.Reshape"([[VAL_15]]#1, [[VAL_22]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: 
[[VAL_24:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_24:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_25:%.*]] = "tf.Reshape"([[VAL_15]]#2, [[VAL_24]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_27:%.*]] = "tf.Reshape"([[VAL_15]]#3, [[VAL_26]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_30:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_28]], [[VAL_29]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_31:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_33:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_34:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_35:%.*]] = arith.constant dense<1> : tensor<3xi32> -// CHECK: [[VAL_36:%.*]] = "tf.StridedSlice"([[VAL_32]], [[VAL_33]], [[VAL_34]], [[VAL_35]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_37:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_38:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_39:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_36:%.*]] = "tf.StridedSlice"([[VAL_32]], [[VAL_33]], [[VAL_34]], [[VAL_35]]) <{begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_37:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_38:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: 
[[VAL_39:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_36]], [[VAL_32]], [[VAL_37]], [[VAL_38]], [[VAL_39]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -290,24 +290,24 @@ func.func @inference_standard_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, %arg // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> -// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) -// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_21:%.*]] = arith.constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_22:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant 
dense<1> : tensor<3xi32> -// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64} : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) <{begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64}> : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_25:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -337,32 +337,32 @@ func.func @inference_standard_indy_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x4xf32>, tensor<2xi32>) -> tensor<4x10xf32> -// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<1> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<4x10xf32>, tensor<4xi32>, tensor) -> (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -// CHECK-DAG: [[VAL_20:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_20:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_21:%.*]] = "tf.Reshape"([[VAL_15]]#0, [[VAL_20]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_22:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_22:%.*]] = "tf.Const"() <{value = 
dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_23:%.*]] = "tf.Reshape"([[VAL_15]]#1, [[VAL_22]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_24:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_24:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_25:%.*]] = "tf.Reshape"([[VAL_15]]#2, [[VAL_24]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_27:%.*]] = "tf.Reshape"([[VAL_15]]#3, [[VAL_26]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_30:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_28]], [[VAL_29]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_31:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_33:%.*]] = arith.constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_34:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_35:%.*]] = arith.constant dense<1> : tensor<3xi32> -// CHECK: [[VAL_36:%.*]] = "tf.StridedSlice"([[VAL_32]], [[VAL_33]], [[VAL_34]], [[VAL_35]]) {begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64} : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_37:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_38:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_39:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_36:%.*]] = "tf.StridedSlice"([[VAL_32]], [[VAL_33]], [[VAL_34]], [[VAL_35]]) <{begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64}> : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_37:%.*]] = "tf.Const"() <{value = 
dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_38:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_39:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_36]], [[VAL_32]], [[VAL_37]], [[VAL_38]], [[VAL_39]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -389,24 +389,24 @@ func.func @inference_standard_lstm_time_major_go_backwards(%arg0: tensor, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_10:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_11:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_10]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> -// CHECK-DAG: [[VAL_12:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_12:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_14:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_12]], [[VAL_13]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_15:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_15:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_17:%.*]]:4 = "tf.SplitV"([[VAL_11]], [[VAL_15]], [[VAL_16]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) -// CHECK-DAG: [[VAL_18:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_19:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_18:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_19:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_21:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_24:%.*]] = 
arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_25:%.*]] = arith.constant dense<1> : tensor<3xi32> -// CHECK: [[VAL_26:%.*]] = "tf.StridedSlice"([[VAL_22]], [[VAL_23]], [[VAL_24]], [[VAL_25]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_26:%.*]] = "tf.StridedSlice"([[VAL_22]], [[VAL_23]], [[VAL_24]], [[VAL_25]]) <{begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_26]], [[VAL_22]], [[VAL_27]], [[VAL_28]], [[VAL_29]] : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -438,32 +438,32 @@ func.func @inference_standard_indy_lstm_time_major_go_backwards(%arg0: tensor<8x // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x4xf32>, tensor<2xi32>) -> tensor<4x10xf32> -// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<1> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<4x10xf32>, tensor<4xi32>, tensor) -> (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -// CHECK-DAG: [[VAL_20:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_20:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_21:%.*]] = "tf.Reshape"([[VAL_15]]#0, [[VAL_20]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_22:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// 
CHECK-DAG: [[VAL_22:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_23:%.*]] = "tf.Reshape"([[VAL_15]]#1, [[VAL_22]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_24:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_24:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_25:%.*]] = "tf.Reshape"([[VAL_15]]#2, [[VAL_24]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_27:%.*]] = "tf.Reshape"([[VAL_15]]#3, [[VAL_26]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_30:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_28]], [[VAL_29]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_31:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_41]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_33:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_34:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_35:%.*]] = arith.constant dense<1> : tensor<3xi32> -// CHECK: [[VAL_36:%.*]] = "tf.StridedSlice"([[VAL_32]], [[VAL_33]], [[VAL_34]], [[VAL_35]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_37:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_38:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_39:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_36:%.*]] = "tf.StridedSlice"([[VAL_32]], [[VAL_33]], [[VAL_34]], [[VAL_35]]) <{begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: 
[[VAL_37:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_38:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_39:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_36]], [[VAL_32]], [[VAL_37]], [[VAL_38]], [[VAL_39]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -490,24 +490,24 @@ func.func @inference_standard_lstm_non_time_major_go_backwards(%arg0: tensor<8x8 // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_8]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_10:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_11:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_10]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> -// CHECK-DAG: [[VAL_12:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_12:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_14:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_12]], [[VAL_13]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_15:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_15:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_17:%.*]]:4 = "tf.SplitV"([[VAL_11]], [[VAL_15]], [[VAL_16]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) -// CHECK-DAG: [[VAL_18:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_19:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_18:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_19:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_21:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, 
none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_24:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_25:%.*]] = arith.constant dense<1> : tensor<3xi32> -// CHECK: [[VAL_26:%.*]] = "tf.StridedSlice"([[VAL_22]], [[VAL_23]], [[VAL_24]], [[VAL_25]]) {begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64} : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_26:%.*]] = "tf.StridedSlice"([[VAL_22]], [[VAL_23]], [[VAL_24]], [[VAL_25]]) <{begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64}> : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_26]], [[VAL_22]], [[VAL_27]], [[VAL_28]], [[VAL_29]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -539,32 +539,32 @@ func.func @inference_standard_indy_lstm_non_time_major_go_backwards(%arg0: tenso // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x4xf32>, tensor<2xi32>) -> tensor<4x10xf32> -// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<1> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<4x10xf32>, tensor<4xi32>, tensor) -> (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -// CHECK-DAG: [[VAL_20:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_20:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_21:%.*]] = 
"tf.Reshape"([[VAL_15]]#0, [[VAL_20]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_22:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_22:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_23:%.*]] = "tf.Reshape"([[VAL_15]]#1, [[VAL_22]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_24:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_24:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_25:%.*]] = "tf.Reshape"([[VAL_15]]#2, [[VAL_24]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: [[VAL_27:%.*]] = "tf.Reshape"([[VAL_15]]#3, [[VAL_26]]) : (tensor<1x10xf32>, tensor<1xi32>) -> tensor<10xf32> -// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_30:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_28]], [[VAL_29]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_31:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_41]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_33:%.*]] = arith.constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_34:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_35:%.*]] = arith.constant dense<1> : tensor<3xi32> -// CHECK: [[VAL_36:%.*]] = "tf.StridedSlice"([[VAL_32]], [[VAL_33]], [[VAL_34]], [[VAL_35]]) {begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64} : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_37:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_38:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_39:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_36:%.*]] = "tf.StridedSlice"([[VAL_32]], [[VAL_33]], [[VAL_34]], [[VAL_35]]) <{begin_mask = 5 : i64, 
ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64}> : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_37:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_38:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_39:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_36]], [[VAL_32]], [[VAL_37]], [[VAL_38]], [[VAL_39]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -596,24 +596,24 @@ func.func @inference_standard_lstm_time_major_can_fuse(%arg0: tensor, // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> -// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) -// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, 
tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor // CHECK-DAG: [[VAL_21:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_22:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<1> : tensor<3xi32> -// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) <{begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_25:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -646,24 +646,24 @@ func.func @inference_standard_lstm_time_major_can_fuse_last_output(%arg0: tensor // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> -// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_10:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_11:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_13:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_14:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) -// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> 
-// CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = "tfl.no_value"() {value} : () -> none // CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor // CHECK-DAG: [[VAL_21:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_22:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<1> : tensor<3xi32> -// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<8x10xf32> -// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) <{begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_25:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_26:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<8x10xf32> +// CHECK-DAG: [[VAL_27:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor // CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -684,13 +684,13 @@ func.func @inference_standard_lstm_with_mask(%arg0: tensor, %arg1: te } // CHECK: func @inference_standard_lstm_with_mask([[ARG_0:%.*]]: tensor, [[ARG_1:%.*]]: tensor<8x10xf32>, [[ARG_2:%.*]]: tensor<8x10xf32>, [[ARG_3:%.*]]: tensor<8x40xf32>, [[ARG_4:%.*]]: tensor<10x40xf32>, [[ARG_5:%.*]]: tensor<40xf32>, [[ARG_6:%.*]]: tensor) -> (tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", 
"tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$dim { size: 8 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: false", "tfshape$unknown_rank: false", "tfshape$dim { size: -1 } dim { size: 8 }"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { -// CHECK: [[VAL_0:%.*]] = "tf.BatchMatMulV2"([[ARG_0]], [[ARG_3]]) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor +// CHECK: [[VAL_0:%.*]] = "tf.BatchMatMulV2"([[ARG_0]], [[ARG_3]]) <{adj_x = false, adj_y = false}> : (tensor, tensor<8x40xf32>) -> tensor // CHECK: [[VAL_1:%.*]] = "tf.Add"([[VAL_0]], [[ARG_5]]) : (tensor, tensor<40xf32>) -> tensor -// CHECK: [[VAL_2:%.*]] = "tf.BatchMatMulV2"([[VAL_1]], [[ARG_4]]) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor +// CHECK: [[VAL_2:%.*]] = "tf.BatchMatMulV2"([[VAL_1]], [[ARG_4]]) <{adj_x = false, adj_y = true}> : (tensor, tensor<10x40xf32>) -> tensor // CHECK: [[VAL_3:%.*]] = "tf.Add"([[VAL_2]], [[ARG_1]]) : (tensor, tensor<8x10xf32>) -> tensor // CHECK: [[VAL_4:%.*]] = "tf.Add"([[VAL_2]], [[ARG_2]]) : (tensor, tensor<8x10xf32>) -> tensor // CHECK: [[VAL_5:%.*]] = "tf.Add"([[ARG_1]], [[ARG_2]]) : (tensor<8x10xf32>, tensor<8x10xf32>) -> tensor<8x10xf32> -// CHECK: [[VAL_6:%.*]] = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor +// CHECK: [[VAL_6:%.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32} : () -> tensor // CHECK: return [[VAL_5]], [[VAL_4]], [[VAL_5]], [[VAL_5]], [[VAL_6]] : tensor<8x10xf32>, tensor, tensor<8x10xf32>, tensor<8x10xf32>, tensor // CHECK: } @@ -718,13 +718,13 @@ func.func @inference_standard_lstm_time_major_cannot_fuse(%arg0: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { -// CHECK: [[VAL_6:%.*]] = "tf.BatchMatMulV2"([[VAL_0]], [[VAL_3]]) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor +// CHECK: [[VAL_6:%.*]] = "tf.BatchMatMulV2"([[VAL_0]], [[VAL_3]]) <{adj_x = false, adj_y = false}> : (tensor, tensor<8x40xf32>) -> tensor // CHECK: [[VAL_7:%.*]] = "tf.Add"([[VAL_6]], [[VAL_5]]) : (tensor, tensor<40xf32>) -> tensor -// CHECK: [[VAL_8:%.*]] = "tf.BatchMatMulV2"([[VAL_7]], [[VAL_4]]) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor +// CHECK: [[VAL_8:%.*]] = "tf.BatchMatMulV2"([[VAL_7]], [[VAL_4]]) <{adj_x = false, adj_y = true}> : (tensor, tensor<10x40xf32>) -> tensor // CHECK: [[VAL_9:%.*]] = "tf.Add"([[VAL_8]], [[VAL_1]]) : (tensor, tensor) -> tensor // CHECK: [[VAL_10:%.*]] = "tf.Add"([[VAL_8]], [[VAL_2]]) : (tensor, tensor) -> tensor // CHECK: [[VAL_11:%.*]] = "tf.Add"([[VAL_1]], [[VAL_2]]) : (tensor, tensor) -> tensor -// CHECK: [[VAL_12:%.*]] = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = 
dense<1.000000e+00> : tensor} : () -> tensor +// CHECK: [[VAL_12:%.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32} : () -> tensor // CHECK: return [[VAL_11]], [[VAL_10]], [[VAL_11]], [[VAL_11]], [[VAL_12]] : tensor, tensor, tensor, tensor, tensor // CHECK: } } @@ -745,13 +745,13 @@ func.func @dynamic_shape_non_fuse_standard_lstm(%arg0: tensor, %arg1: } // CHECK: func @dynamic_shape_non_fuse_standard_lstm(%[[VAL_0:.*]]: tensor, %[[VAL_1:.*]]: tensor, %[[VAL_2:.*]]: tensor, %[[VAL_3:.*]]: tensor<8x40xf32>, %[[VAL_4:.*]]: tensor<10x40xf32>, %[[VAL_5:.*]]: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { -// CHECK: %[[VAL_6:.*]] = "tf.BatchMatMulV2"(%[[VAL_0]], %[[VAL_3]]) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor +// CHECK: %[[VAL_6:.*]] = "tf.BatchMatMulV2"(%[[VAL_0]], %[[VAL_3]]) <{adj_x = false, adj_y = false}> : (tensor, tensor<8x40xf32>) -> tensor // CHECK: %[[VAL_7:.*]] = "tf.Add"(%[[VAL_6]], %[[VAL_5]]) : (tensor, tensor<40xf32>) -> tensor -// CHECK: %[[VAL_8:.*]] = "tf.BatchMatMulV2"(%[[VAL_7]], %[[VAL_4]]) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor +// CHECK: %[[VAL_8:.*]] = "tf.BatchMatMulV2"(%[[VAL_7]], %[[VAL_4]]) <{adj_x = false, adj_y = true}> : (tensor, tensor<10x40xf32>) -> tensor // CHECK: %[[VAL_9:.*]] = "tf.Add"(%[[VAL_8]], %[[VAL_1]]) : (tensor, tensor) -> tensor // CHECK: %[[VAL_10:.*]] = "tf.Add"(%[[VAL_8]], %[[VAL_2]]) : (tensor, tensor) -> tensor // CHECK: %[[VAL_11:.*]] = "tf.Add"(%[[VAL_1]], %[[VAL_2]]) : (tensor, tensor) -> tensor -// CHECK: %[[VAL_12:.*]] = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor +// CHECK: %[[VAL_12:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32} : () -> tensor // CHECK: return %[[VAL_11]], %[[VAL_10]], %[[VAL_11]], %[[VAL_11]], %[[VAL_12]] : tensor, tensor, tensor, tensor, tensor // CHECK: } } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant-4bit.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant-4bit.mlir index 9a865f08464772..dca4c21766ee4a 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant-4bit.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant-4bit.mlir @@ -40,7 +40,7 @@ func.func @fakeQuantForActivationNoDuplication(tensor<8xf32>) -> (tensor<8x!quan %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} : (tensor<8xf32>) -> tensor<8x!quant.uniform> func.return %1 : tensor<8x!quant.uniform> -// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) {narrow_range = false, num_bits = 3 : i64} +// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) <{narrow_range = false, num_bits = 3 : i64}> // CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} // CHECK: return %1 } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir 
b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir index 0c5bdac0a0b792..c65cecc188f468 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir @@ -39,7 +39,7 @@ func.func @fakeQuantForActivationNoDuplication(tensor<8xf32>) -> (tensor<8x!quan %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} : (tensor<8xf32>) -> tensor<8x!quant.uniform> func.return %1 : tensor<8x!quant.uniform> -// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) {narrow_range = false, num_bits = 5 : i64} +// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) <{narrow_range = false, num_bits = 5 : i64}> // CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} // CHECK: return %1 } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 4f3914265b4da5..fff00820ce353b 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -30,7 +30,7 @@ func.func @conv(tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>, tensor<256x3x32x3 // CHECK: %5 = "tf.Pad"(%arg0, %[[CONSTANT1]]) : (tensor<256x32x32x3xf32>, tensor<4x2xi32>) -> tensor<*xf32> // CHECK: %6 = "tf.Transpose"(%arg1, %[[CONSTANT0]]) : (tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor<16x3x3x3xf32> // CHECK: %7 = "tfl.conv_2d"(%5, %6, %[[CONSTANT]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<*xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x32x32x16xf32> -// CHECK: %8 = "tf.Conv2D"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [2, 1, 1, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x32x32x16xf32> +// CHECK: %8 = "tf.Conv2D"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [2, 1, 1, 1]}> {T = "tfdtype$DT_FLOAT"} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x32x32x16xf32> } func.func @depthwiseConv2D(tensor<256x32x32x3xf32>, tensor<3x3x3x4xf32>, tensor<256x3x32x32xf32>) -> (tensor<256x30x30x12xf32>, tensor<256x12x30x30xf32>, tensor<256x30x30x12xf32>, tensor<256x30x30x12xf32>) { @@ -224,9 +224,9 @@ func.func @matmulNoTransposeAOrB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1 func.return %166 : tensor<1x1000xf32> // CHECK-LABEL: matmulNoTransposeAOrB - // CHECK: %[[RES:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor + // CHECK: %[[RES:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor // CHECK: %[[TRANS:.*]] = "tf.Transpose"(%arg1, %[[RES]]) : (tensor<1280x1000xf32>, tensor) -> tensor<*xf32> - // CHECK: %[[MM:.*]] = "tf.MatMul"(%arg0, %[[TRANS]]) {transpose_a = false, transpose_b = true} : (tensor<1x1280xf32>, tensor<*xf32>) -> tensor<1x1000xf32> + // CHECK: %[[MM:.*]] = "tf.MatMul"(%arg0, %[[TRANS]]) <{transpose_a = false, transpose_b = true}> : (tensor<1x1280xf32>, tensor<*xf32>) -> tensor<1x1000xf32> // CHECK: return %[[MM]] : tensor<1x1000xf32> } @@ -235,10 +235,10 @@ func.func @matmulNoTransposeB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1000 func.return %166 : tensor<1x1000xf32> // CHECK-LABEL: matmulNoTransposeB - // CHECK: %[[RES:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor + // CHECK: %[[RES:.*]] = "tf.Const"() <{value = dense<[1, 
0]> : tensor<2xi32>}> : () -> tensor // CHECK: %[[TRANS1:.*]] = "tf.Transpose"(%arg0, %[[RES]]) : (tensor<1x1280xf32>, tensor) -> tensor<*xf32> // CHECK: %[[TRANS2:.*]] = "tf.Transpose"(%arg1, %[[RES]]) : (tensor<1280x1000xf32>, tensor) -> tensor<*xf32> - // CHECK: %[[MM:.*]] = "tf.MatMul"(%[[TRANS1]], %[[TRANS2]]) {transpose_a = false, transpose_b = true} : (tensor<*xf32>, tensor<*xf32>) -> tensor<1x1000xf32> + // CHECK: %[[MM:.*]] = "tf.MatMul"(%[[TRANS1]], %[[TRANS2]]) <{transpose_a = false, transpose_b = true}> : (tensor<*xf32>, tensor<*xf32>) -> tensor<1x1000xf32> // CHECK: return %[[MM]] : tensor<1x1000xf32> } @@ -284,7 +284,7 @@ func.func @StridedSliceEllipsisMaskBefore(%arg0: tensor<21x15x7xf32>) -> tensor< // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<1> : tensor<3xi32> - // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST]], %[[CST_0]]) {begin_mask = 3 : i64, ellipsis_mask = 0 : i64, end_mask = 3 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<21x15x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<21x15x2xf32> + // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST]], %[[CST_0]]) <{begin_mask = 3 : i64, ellipsis_mask = 0 : i64, end_mask = 3 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<21x15x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<21x15x2xf32> } // CHECK-LABEL: @StridedSliceEllipsisMaskBeforeWithBeginAndEndMask @@ -298,7 +298,7 @@ func.func @StridedSliceEllipsisMaskBeforeWithBeginAndEndMask(%arg0: tensor<4x5x4 // CHECK-DAG: %[[CST:.*]] = arith.constant dense<[0, 1, 0]> : tensor<3xi32> // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<1> : tensor<3xi32> - // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST_0]], %[[CST_1]]) {begin_mask = 7 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4x5x4xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<4x4x4xf32> + // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST_0]], %[[CST_1]]) <{begin_mask = 7 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<4x5x4xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<4x4x4xf32> } // CHECK-LABEL: @StridedSliceEllipsisMaskAfter @@ -310,7 +310,7 @@ func.func @StridedSliceEllipsisMaskAfter(%arg0: tensor<21x15x7xf32>) -> tensor<5 // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<1> : tensor<3xi32> - // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST]], %[[CST_0]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<21x15x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<5x15x7xf32> + // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST]], %[[CST_0]]) <{begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<21x15x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<5x15x7xf32> } // CHECK-LABEL: @NoStridedSliceEllipsisMask @@ -322,7 +322,7 @@ func.func @NoStridedSliceEllipsisMask(%arg0: tensor<*xf32>) -> tensor<21x15x2xf3 // CHECK-DAG: %[[CST:.*]] = 
arith.constant dense<0> : tensor<2xi32> // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<1> : tensor<2xi32> - // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST]], %[[CST_0]]) {begin_mask = 0 : i64, ellipsis_mask = 1 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<*xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<21x15x2xf32> + // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST]], %[[CST_0]]) <{begin_mask = 0 : i64, ellipsis_mask = 1 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<*xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<21x15x2xf32> } // CHECK-LABEL: @NoPadStridedSliceNonNewAxisMask @@ -334,7 +334,7 @@ func.func @NoPadStridedSliceNonNewAxisMask(%arg0: tensor<1x2x3x1xf32>) -> tensor // CHECK-DAG: %cst = arith.constant dense<0> : tensor<4xi32> // CHECK-DAG: %cst_0 = arith.constant dense<1> : tensor<4xi32> - // CHECK: %0 = "tf.StridedSlice"(%arg0, %cst, %cst, %cst_0) {begin_mask = 15 : i64, ellipsis_mask = 0 : i64, end_mask = 15 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1x2x3x1xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x3x1xf32> + // CHECK: %0 = "tf.StridedSlice"(%arg0, %cst, %cst, %cst_0) <{begin_mask = 15 : i64, ellipsis_mask = 0 : i64, end_mask = 15 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<1x2x3x1xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x3x1xf32> } // CHECK-LABEL: @PadStridedSliceNewAxisMask1 @@ -348,7 +348,7 @@ func.func @PadStridedSliceNewAxisMask1(%arg0: tensor<2x3xf32>) -> tensor<1x2x3x1 // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<1> : tensor<4xi32> // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[1, 2, 3, 1]> : tensor<4xi32> // CHECK: %0 = "tf.Reshape"(%arg0, %[[cst_1]]) : (tensor<2x3xf32>, tensor<4xi32>) -> tensor<1x2x3x1xf32> - // CHECK: %1 = "tf.StridedSlice"(%0, %[[CST0]], %[[CST0]], %[[CST1]]) {begin_mask = 15 : i64, ellipsis_mask = 0 : i64, end_mask = 15 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1x2x3x1xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x3x1xf32> + // CHECK: %1 = "tf.StridedSlice"(%0, %[[CST0]], %[[CST0]], %[[CST1]]) <{begin_mask = 15 : i64, ellipsis_mask = 0 : i64, end_mask = 15 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<1x2x3x1xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x3x1xf32> } // CHECK-LABEL: @PadStridedSliceNewAxisMask2 @@ -401,7 +401,7 @@ func.func @strided_slice_with_constant_attributes(%arg0: tensor<10x10x10xf32>, % // CHECK-DAG: [[BEGIN:%cst.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[END:%cst.*]] = arith.constant dense<[0, 10, 10]> : tensor<3xi32> // CHECK-DAG: [[STRIDES:%cst.*]] = arith.constant dense<1> : tensor<3xi32> - // CHECK-NEXT: "tf.StridedSlice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<10x10x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<10x10xf32> + // CHECK-NEXT: "tf.StridedSlice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) <{begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<10x10x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<10x10xf32> } // CHECK-LABEL: @StridedSliceEllipsisAndNewAxisMaskBothSet @@ 
-419,7 +419,7 @@ func.func @StridedSliceEllipsisAndNewAxisMaskBothSet(%arg0: tensor<6x7x8xf32>) - // CHECK-DAG: %[[STEP:.*]] = arith.constant dense<1> : tensor<5xi32> // CHECK-DAG: %[[NEW_DIMS:.*]] = arith.constant dense<[6, 1, 7, 8, 1]> : tensor<5xi32> // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[NEW_DIMS]]) : (tensor<6x7x8xf32>, tensor<5xi32>) -> tensor<6x1x7x8x1xf32> - // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%[[RESHAPE]], %[[BEGIN]], %[[END]], %[[STEP]]) {begin_mask = 30 : i64, ellipsis_mask = 0 : i64, end_mask = 30 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<6x1x7x8x1xf32>, tensor<5xi32>, tensor<5xi32>, tensor<5xi32>) -> tensor<2x1x7x8x1xf32> + // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%[[RESHAPE]], %[[BEGIN]], %[[END]], %[[STEP]]) <{begin_mask = 30 : i64, ellipsis_mask = 0 : i64, end_mask = 30 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<6x1x7x8x1xf32>, tensor<5xi32>, tensor<5xi32>, tensor<5xi32>) -> tensor<2x1x7x8x1xf32> } // CHECK-LABEL: @StridedSliceShrinkAxisAndNewAxisMaskBothSet @@ -437,7 +437,7 @@ func.func @StridedSliceShrinkAxisAndNewAxisMaskBothSet(%arg0: tensor<6x7x8xf32>) // CHECK-DAG: %[[END:.*]] = arith.constant dense<[2, 3, 4, 5, 8]> : tensor<5xi32> // CHECK-DAG: %[[STEP:.*]] = arith.constant dense<1> : tensor<5xi32> // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[NEW_DIMS]]) : (tensor<6x7x8xf32>, tensor<5xi32>) -> tensor<6x1x7x1x8xf32> - // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%[[RESHAPE]], %[[BEGIN]], %[[END]], %[[STEP]]) {begin_mask = 26 : i64, ellipsis_mask = 0 : i64, end_mask = 26 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<6x1x7x1x8xf32>, tensor<5xi32>, tensor<5xi32>, tensor<5xi32>) -> tensor<1x4x1x8xf32> + // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%[[RESHAPE]], %[[BEGIN]], %[[END]], %[[STEP]]) <{begin_mask = 26 : i64, ellipsis_mask = 0 : i64, end_mask = 26 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<6x1x7x1x8xf32>, tensor<5xi32>, tensor<5xi32>, tensor<5xi32>) -> tensor<1x4x1x8xf32> } func.func @broadcast_to_f32_low_dim(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> tensor<3x3xf32> { @@ -572,7 +572,7 @@ func.func @lower_rfft_to_rfft2d(%input: tensor<10x20x30xf32>, %fft_len: tensor<1 // CHECK: %[[EXP:.*]] = "tf.ExpandDims"(%arg0, %[[CST]]) : (tensor<10x20x30xf32>, tensor) -> tensor<10x20x1x30xf32> // CHECK: %[[CON:.*]] = "tf.ConcatV2"(%[[CST0]], %arg1, %[[CST1]]) : (tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<2xi32> // CHECK: %[[RFF:.*]] = "tf.RFFT2D"(%[[EXP]], %[[CON]]) : (tensor<10x20x1x30xf32>, tensor<2xi32>) -> tensor<10x20x1x30xcomplex> -// CHECK: %[[SQE:.*]] = "tf.Squeeze"(%[[RFF]]) {squeeze_dims = [-2]} : (tensor<10x20x1x30xcomplex>) -> tensor<10x20x30xcomplex> +// CHECK: %[[SQE:.*]] = "tf.Squeeze"(%[[RFF]]) <{squeeze_dims = [-2]}> : (tensor<10x20x1x30xcomplex>) -> tensor<10x20x30xcomplex> } // CHECK-LABEL: xla_gather_to_strided_slice @@ -585,7 +585,7 @@ func.func @xla_gather_to_strided_slice(%arg0 : tensor<1x9x104x768xf32>) -> tenso // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : tensor<4xi64> // CHECK-DAG: %[[CST0:.*]] = arith.constant dense<[1, 9, 23, 768]> : tensor<4xi64> // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<1> : tensor<4xi64> -// CHECK: %[[V0:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST0]], %[[CST1]]) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<1x9x104x768xf32>, tensor<4xi64>, 
tensor<4xi64>, tensor<4xi64>) -> tensor<*xf32> +// CHECK: %[[V0:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST0]], %[[CST1]]) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64}> : (tensor<1x9x104x768xf32>, tensor<4xi64>, tensor<4xi64>, tensor<4xi64>) -> tensor<*xf32> // CHECK: return %[[V0]] : tensor<*xf32> } @@ -660,9 +660,9 @@ func.func @fused_batch_norm_v3_training(%arg0 : tensor<1x1x6x2xf32>, %arg1 : ten // CHECK-LABEL: fused_batch_norm_v3_training // CHECK-DAG: %[[CST:.*]] = arith.constant dense<[0, 1, 2]> : tensor<3xi32> // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<1.000000e-03> : tensor - // CHECK: %[[MEAN:.*]] = "tf.Mean"(%arg0, %[[CST]]) {keep_dims = false} : (tensor<1x1x6x2xf32>, tensor<3xi32>) -> tensor<2xf32> + // CHECK: %[[MEAN:.*]] = "tf.Mean"(%arg0, %[[CST]]) <{keep_dims = false}> : (tensor<1x1x6x2xf32>, tensor<3xi32>) -> tensor<2xf32> // CHECK: %[[SQ:.*]] = "tf.SquaredDifference"(%arg0, %[[MEAN]]) : (tensor<1x1x6x2xf32>, tensor<2xf32>) -> tensor<1x1x6x2xf32> - // CHECK: %[[MEAN0:.*]] = "tf.Mean"(%[[SQ]], %[[CST]]) {keep_dims = false} : (tensor<1x1x6x2xf32>, tensor<3xi32>) -> tensor<2xf32> + // CHECK: %[[MEAN0:.*]] = "tf.Mean"(%[[SQ]], %[[CST]]) <{keep_dims = false}> : (tensor<1x1x6x2xf32>, tensor<3xi32>) -> tensor<2xf32> // CHECK: %[[ADD:.*]] = "tf.Add"(%[[MEAN0]], %[[CST1]]) : (tensor<2xf32>, tensor) -> tensor<2xf32> // CHECK: %[[RSQRT:.*]] = "tf.Rsqrt"(%[[ADD]]) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[MUL1:.*]] = "tf.Mul"(%arg1, %[[RSQRT]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> @@ -689,10 +689,10 @@ func.func @add_v2_uint32(%arg0: tensor, %arg1: tensor) -> tensor // CHECK-LABEL: add_v2_uint32 - // CHECK: %[[CAST:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor) -> tensor - // CHECK: %[[CAST1:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor) -> tensor + // CHECK: %[[CAST:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor) -> tensor + // CHECK: %[[CAST1:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor) -> tensor // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[CAST]], %[[CAST1]]) : (tensor, tensor) -> tensor - // CHECK: %[[CAST2:.*]] = "tf.Cast"(%[[ADD]]) {Truncate = false} : (tensor) -> tensor + // CHECK: %[[CAST2:.*]] = "tf.Cast"(%[[ADD]]) <{Truncate = false}> : (tensor) -> tensor // CHECK: return %[[CAST2]] : tensor } @@ -713,12 +713,12 @@ func.func @QuantDequantTranspose(%arg0: tensor<2x3xf32>) -> (tensor<2x4xf32>) { func.return %6 : tensor<2x4xf32> // CHECK-LABEL: QuantDequantTranspose - // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor + // CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<1.00392163> : tensor<3x4xf32> // CHECK: %[[QUANT:.*]] = "tfl.quantize"(%[[CST_0]]) {qtype = tensor<3x4x!quant.uniform>} : (tensor<3x4xf32>) -> tensor<3x4x!quant.uniform> // CHECK: %[[DEQUANT:.*]] = "tfl.dequantize"(%[[QUANT]]) : (tensor<3x4x!quant.uniform>) -> tensor<3x4xf32> // CHECK: %[[TRANSPOSE:.*]] = "tf.Transpose"(%[[DEQUANT]], %[[CST]]) : (tensor<3x4xf32>, tensor) -> tensor<*xf32> - // CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[TRANSPOSE]]) {transpose_a = false, transpose_b = true} : (tensor<2x3xf32>, tensor<*xf32>) -> tensor<2x4xf32> + // CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[TRANSPOSE]]) <{transpose_a = false, transpose_b = true}> : (tensor<2x3xf32>, tensor<*xf32>) -> tensor<2x4xf32> // CHECK: return %[[MATMUL]] : 
tensor<2x4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/raise-custom-ops.mlir b/tensorflow/compiler/mlir/lite/tests/raise-custom-ops.mlir index e72f421ad9eafb..477315d696783c 100644 --- a/tensorflow/compiler/mlir/lite/tests/raise-custom-ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/raise-custom-ops.mlir @@ -42,11 +42,11 @@ func.func @tf_executor_wrapper(%arg0: tensor<*xf32>) -> tensor<*xf32> attributes // CHECK: tf_executor.island wraps "tf.FakeQuantWithMinMaxVarsPerChannel" // WRAPPED-NEXT: tf_executor.graph { -// WRAPPED-NEXT: tf_executor.island wraps "tf.Const"() {device = "", value = dense<1.000000e+00> : tensor<186xf32>} : () -> tensor<186xf32> -// WRAPPED-NEXT: tf_executor.island wraps "tf.Const"() {device = "", value = dense<2.000000e+00> : tensor<186xf32>} : () -> tensor<186xf32> +// WRAPPED-NEXT: tf_executor.island wraps "tf.Const"() <{value = dense<1.000000e+00> : tensor<186xf32>}> {device = ""} : () -> tensor<186xf32> +// WRAPPED-NEXT: tf_executor.island wraps "tf.Const"() <{value = dense<2.000000e+00> : tensor<186xf32>}> {device = ""} : () -> tensor<186xf32> // WRAPPED-NEXT: tf_executor.island wraps "tfl.custom_tf" // WRAPPED-NEXT: ^bb0(%arg1: tensor<*xf32>, %arg2: tensor<186xf32>, %arg3: tensor<186xf32>): -// WRAPPED-NEXT: %[[fq:.*]] = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg1, %arg2, %arg3) {device = "", narrow_range = true, num_bits = 8 : i64} : (tensor<*xf32>, tensor<186xf32>, tensor<186xf32>) -> tensor<*xf32> +// WRAPPED-NEXT: %[[fq:.*]] = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg1, %arg2, %arg3) <{narrow_range = true, num_bits = 8 : i64}> {device = ""} : (tensor<*xf32>, tensor<186xf32>, tensor<186xf32>) -> tensor<*xf32> // WRAPPED-NEXT: "tfl.yield"(%[[fq]]) : (tensor<*xf32>) -> () // WRAPPED-NEXT: }) {device = "", narrow_range = true, num_bits = 8 : i64} : (tensor<*xf32>, tensor<186xf32>, tensor<186xf32>) -> tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc index 9563f5d93226fb..f8d1de04649169 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/strings/match.h" #include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -111,20 +112,6 @@ class LiftFlexCustomOp : public OpRewritePattern { Operation* tf_op = rewriter.create(op_state); rewriter.replaceOp(op, tf_op->getResults()); - if (isa(tf_op)) { - constexpr StringRef kFuncAttrName = "f"; - tf_op->setAttr( - kFuncAttrName, - tf_op->getAttr(kFuncAttrName).cast().getName()); - } - - if (isa(tf_op)) { - constexpr StringRef kFuncAttrName = "predicate"; - tf_op->setAttr( - kFuncAttrName, - tf_op->getAttr(kFuncAttrName).cast().getName()); - } - // Special type fixes for TF Resource Tensors that are casted to // Int32 tensor during MLIR->TFLite flatbuffer conversion. 
// TODO(b/146131919): correct handling of resource type @@ -237,6 +224,10 @@ class LiftFlexCustomOp : public OpRewritePattern { if (!mlir_attr.ok()) { return emitError(loc, mlir_attr.status().message()); } + if (absl::StrContains(op_name, "Dataset") && + mlir_attr->isa()) { + mlir_attr = mlir_attr->cast().getName(); + } attributes.push_back(builder.getNamedAttr(attr_name, *mlir_attr)); } return success(); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index a3210f64239403..ea2d77a865dcec 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -32,8 +32,10 @@ cc_library( "passes/lift_quantizable_spots_as_functions.cc", "passes/lift_quantizable_spots_as_functions_fusion.inc", "passes/lift_quantizable_spots_as_functions_simple.inc", + "passes/post_quantize.cc", "passes/prepare_quantize.cc", "passes/quantize.cc", + "passes/quantize_composite_functions.cc", "passes/quantize_weight.cc", "passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc", "passes/restore_function_name.cc", @@ -51,6 +53,7 @@ cc_library( ":quantization_options_proto_cc", ":stablehlo_passes_inc_gen", ":stablehlo_type_utils", + ":uniform_quantized_types", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", @@ -58,6 +61,7 @@ cc_library( "//tensorflow/compiler/mlir/quantization/tensorflow:pass_utils", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", "//tensorflow/compiler/mlir/quantization/tensorflow/ops:tf_op_quant_spec", "//tensorflow/compiler/mlir/quantization/tensorflow/utils:lift_as_function_call_utils", "//tensorflow/compiler/mlir/tensorflow", @@ -185,7 +189,6 @@ cc_library( ], deps = [ ":bridge_passes_inc_gen", - ":math_utils", ":tf_type_utils", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mangling_util", @@ -210,7 +213,6 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:SparseTensorDialect", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", "@local_xla//xla:xla_data_proto_cc", @@ -238,9 +240,12 @@ tf_cc_test( "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/tf2xla:common", "//tensorflow/core:framework", + "//tensorflow/core/kernels:math", + "//tensorflow/core/kernels:nn", "//tensorflow/core/kernels/uniform_quant_ops:kernels", "//tensorflow/core/ops", "@com_google_absl//absl/log:check", + "@com_google_absl//absl/random", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc index 505554bdad4bb0..16af4b212ecfe8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc @@ -15,12 +15,10 @@ limitations under the License. 
#include #include -#include #include #include -#include -#include #include +#include #include "absl/algorithm/container.h" #include "llvm/ADT/ArrayRef.h" @@ -31,12 +29,11 @@ limitations under the License. #include "mlir/Dialect/Func/Transforms/FuncConversions.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project -#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project -#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project @@ -49,7 +46,6 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "stablehlo/dialect/ChloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" -#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/math_utils.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "xla/mlir_hlo/mhlo/transforms/rewriters.h" @@ -57,88 +53,193 @@ limitations under the License. namespace mlir::quant::stablehlo { namespace { -#define GEN_PASS_DEF_CONVERTMHLOQUANTTOINT -#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h.inc" - -// This helper function create ops to requantize `input` tensor and output to -// `res_int32` tensor. Clamping is omitted because for some ops clamping can be -// done later to avoid duplicate. -LogicalResult RequantizeWithoutClamping( - mlir::OpState op, Value input, TensorType int32_tensor_type, - quant::UniformQuantizedType input_quantized_type, - quant::UniformQuantizedType result_quantized_type, Value &res_int32, - ConversionPatternRewriter &rewriter) { +// This helper function create ops to requantize `input` tensor and returns the +// output tensor. Clamping is done if output integer bit-width < 32. +// +// Requantization is essentially dequantize --> quantize. +// +// Dequantize: (input - zp) * scale +// Quantize: input / scale + zp +// +// Hence, +// output = (input - input_zp) * input_scale / output_scale + output_zp +// +// This is simplified as: +// output = input * merged_scale + merged_zp +// where: +// merged_zp = output_zp - input_zp * merged_scale. +// merged_scale = input_scale / output_scale. +Value Requantize(mlir::OpState op, Value input, + UniformQuantizedType input_quantized_type, + UniformQuantizedType output_quantized_type, + TensorType output_tensor_type, + ConversionPatternRewriter &rewriter) { // Skip requantization when input and result have the same type. - if (input_quantized_type == result_quantized_type) { - res_int32 = rewriter.create(op->getLoc(), - int32_tensor_type, input); - return success(); + if (input_quantized_type == output_quantized_type) { + return rewriter.create(op->getLoc(), output_tensor_type, + input); } - // Convert input to int32 tensor. - res_int32 = - rewriter.create(op->getLoc(), int32_tensor_type, input); - // Undo the input zero point. 
- Value input_zero_point = rewriter.create( - op->getLoc(), rewriter.getI32IntegerAttr(static_cast( - input_quantized_type.getZeroPoint()))); - res_int32 = rewriter.create( - op->getLoc(), int32_tensor_type, res_int32, input_zero_point, nullptr); - - // Adjust the scale. - const double effective_scale = - input_quantized_type.getScale() / result_quantized_type.getScale(); - int32_t effective_quantized_fraction; - int32_t effective_shift; - if (failed(quant::stablehlo::QuantizeMultiplier( - effective_scale, effective_quantized_fraction, effective_shift))) { - op->emitError("Invalid effective quantization scale."); + double merged_scale_fp = + input_quantized_type.getScale() / output_quantized_type.getScale(); + Value merged_scale = rewriter.create( + op->getLoc(), + rewriter.getF32FloatAttr(static_cast(merged_scale_fp))); + + auto float_tensor_type = + input.getType().cast().clone(rewriter.getF32Type()); + Value output_float = + rewriter.create(op->getLoc(), float_tensor_type, input); + + output_float = rewriter.create( + op->getLoc(), float_tensor_type, output_float, merged_scale, nullptr); + + // Add merged_zp only when it is non-zero. + double merged_zp_fp = output_quantized_type.getZeroPoint() - + input_quantized_type.getZeroPoint() * merged_scale_fp; + if (merged_zp_fp != 0) { + Value merged_zp = rewriter.create( + op->getLoc(), + rewriter.getF32FloatAttr(static_cast(merged_zp_fp))); + output_float = rewriter.create( + op->getLoc(), float_tensor_type, output_float, merged_zp, nullptr); + } + + // Clamp output if the output integer bit-width <32. + if (output_tensor_type.getElementType().cast().getWidth() < 32) { + Value quantization_min = rewriter.create( + op->getLoc(), rewriter.getF32FloatAttr(static_cast( + output_quantized_type.getStorageTypeMin()))); + Value quantization_max = rewriter.create( + op->getLoc(), rewriter.getF32FloatAttr(static_cast( + output_quantized_type.getStorageTypeMax()))); + // Clamp results by [quantization_min, quantization_max]. + output_float = rewriter.create( + op->getLoc(), float_tensor_type, quantization_min, output_float, + quantization_max); + } + + output_float = rewriter.create( + op->getLoc(), float_tensor_type, output_float); + return rewriter.create(op->getLoc(), output_tensor_type, + output_float); +} + +using QuantType = + std::variant; +FailureOr GetQuantType(Type type) { + if (auto quant_type = + getElementTypeOrSelf(type).dyn_cast()) { + return QuantType(quant_type); + } else if (auto quant_type = getElementTypeOrSelf(type) + .dyn_cast()) { + return QuantType(quant_type); + } else { return failure(); } - Value multiplier = rewriter.create( - op->getLoc(), rewriter.getI32IntegerAttr( - static_cast(effective_quantized_fraction))); - // The effective_quantized_fraction value has been quantized by multiplying - // (1 << 15). So, we have to shift it back by (15 - effective_shift) to get - // the desired outcome. - Value total_shift = rewriter.create( - op->getLoc(), - rewriter.getI32IntegerAttr(static_cast(15 - effective_shift))); - - // Apply the effective scale with rounding. - Value half = rewriter.create( - op->getLoc(), rewriter.getI32IntegerAttr( - static_cast(1 << (14 - effective_shift)))); - res_int32 = rewriter.create( - op->getLoc(), int32_tensor_type, res_int32, multiplier, nullptr); - res_int32 = rewriter.create( - op->getLoc(), int32_tensor_type, res_int32, half, nullptr); - res_int32 = rewriter.create( - op->getLoc(), int32_tensor_type, res_int32, total_shift, nullptr); - - // Apply the output zero point. 
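A minimal standalone sketch of the merged-scale/zero-point algebra documented above, using made-up quantization parameters (illustrative only, not part of this change); it reproduces output = input * merged_scale + merged_zp on plain scalars:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example parameters: requantize from (scale=0.5, zp=10) to
  // (scale=0.25, zp=3), i.e. dequantize then quantize.
  const double input_scale = 0.5, output_scale = 0.25;
  const int32_t input_zp = 10, output_zp = 3;
  const double merged_scale = input_scale / output_scale;        // 2.0
  const double merged_zp = output_zp - input_zp * merged_scale;  // -17.0
  const int32_t input = 20;                                      // a quantized int8 value
  double out = input * merged_scale + merged_zp;                 // 23.0
  // Clamp to the int8 storage range, mirroring the <32-bit clamp above.
  out = std::min(127.0, std::max(-128.0, std::round(out)));
  std::printf("requantized: %d\n", static_cast<int32_t>(out));   // prints 23
  return 0;
}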
- Value output_zero_point = rewriter.create( - op->getLoc(), rewriter.getI32IntegerAttr(static_cast( - result_quantized_type.getZeroPoint()))); - res_int32 = rewriter.create( - op->getLoc(), int32_tensor_type, res_int32, output_zero_point, nullptr); +} - return success(); +// Extract scale and zero point info from input quant type info. +void GetQuantizationParams(OpBuilder &builder, Location loc, + QuantType quant_type, Value &scales, + Value &zero_points, bool output_zero_point_in_fp, + DenseIntElementsAttr &broadcast_dims) { + // Get scales/zero points for per-tensor and per-axis quantization cases. + if (auto *quant_per_tensor_type = + std::get_if(&quant_type)) { + scales = builder.create( + loc, builder.getF32FloatAttr(quant_per_tensor_type->getScale())); + if (output_zero_point_in_fp) { + zero_points = builder.create( + loc, builder.getF32FloatAttr( + static_cast(quant_per_tensor_type->getZeroPoint()))); + } else { + zero_points = builder.create( + loc, builder.getI32IntegerAttr(static_cast( + quant_per_tensor_type->getZeroPoint()))); + } + } else { + auto &quant_per_channel_type = + std::get(quant_type); + llvm::SmallVector scales_vec; + for (auto scale : quant_per_channel_type.getScales()) + scales_vec.push_back(scale); + scales = builder.create( + loc, DenseFPElementsAttr::get( + RankedTensorType::get( + {static_cast( + quant_per_channel_type.getScales().size())}, + builder.getF32Type()), + scales_vec)); + if (output_zero_point_in_fp) { + llvm::SmallVector zero_points_vec; + for (auto zero_point : quant_per_channel_type.getZeroPoints()) + zero_points_vec.push_back(zero_point); + zero_points = builder.create( + loc, DenseFPElementsAttr::get( + RankedTensorType::get( + {static_cast( + quant_per_channel_type.getZeroPoints().size())}, + builder.getF32Type()), + zero_points_vec)); + } else { + llvm::SmallVector zero_points_vec; + for (auto zero_point : quant_per_channel_type.getZeroPoints()) + zero_points_vec.push_back(zero_point); + zero_points = builder.create( + loc, DenseIntElementsAttr::get( + RankedTensorType::get( + {static_cast( + quant_per_channel_type.getZeroPoints().size())}, + builder.getI32Type()), + zero_points_vec)); + } + broadcast_dims = DenseIntElementsAttr::get( + RankedTensorType::get({1}, builder.getI64Type()), + {static_cast(quant_per_channel_type.getQuantizedDimension())}); + } } -class ConvertMHLOQuantToInt - : public impl::ConvertMHLOQuantToIntBase { - public: - ConvertMHLOQuantToInt() = default; - ConvertMHLOQuantToInt(const ConvertMHLOQuantToInt &) {} +// Extract storage min/max from input quant type info. +void GetQuantizationStorageInfo(OpBuilder &builder, Location loc, + QuantType quant_type, Value &storage_min, + Value &storage_max) { + if (auto *quant_per_tensor_type = + std::get_if(&quant_type)) { + storage_min = builder.create( + loc, builder.getF32FloatAttr(static_cast( + quant_per_tensor_type->getStorageTypeMin()))); + storage_max = builder.create( + loc, builder.getF32FloatAttr(static_cast( + quant_per_tensor_type->getStorageTypeMax()))); + } else { + auto &quant_per_channel_type = + std::get(quant_type); + storage_min = builder.create( + loc, builder.getF32FloatAttr(static_cast( + quant_per_channel_type.getStorageTypeMin()))); + storage_max = builder.create( + loc, builder.getF32FloatAttr(static_cast( + quant_per_channel_type.getStorageTypeMax()))); + } +} - explicit ConvertMHLOQuantToInt(bool legalize_chlo) { - legalize_chlo_ = legalize_chlo; +// Get storage type of a UQ type. Return original type if it is no UQ type. 
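// For example (illustrative, not part of the patch): a value of type
//   tensor<4x!quant.uniform<i8:f32, 2.000000e+00:3>>
// maps to its storage type tensor<4xi8>, a per-channel
//   tensor<2x3x!quant.uniform<i8:f32:1, {0.5,0.25}>>
// maps to tensor<2x3xi8>, and a plain tensor<4xf32> is returned unchanged.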
+Type GetQuantStorageType(Type type) { + if (auto shaped = type.dyn_cast()) { + return shaped.clone(GetQuantStorageType(shaped.getElementType())); } - // Performs conversion of MHLO quant ops to primitive ops. - void runOnOperation() override; -}; + if (auto element_type = + getElementTypeOrSelf(type).dyn_cast()) { + return element_type.getStorageType(); + } else if (auto element_type = getElementTypeOrSelf(type) + .dyn_cast()) { + return element_type.getStorageType(); + } else { + return type; + } +} class ConvertUniformQuantizeOp : public OpConversionPattern { @@ -148,124 +249,66 @@ class ConvertUniformQuantizeOp LogicalResult matchAndRewrite( mhlo::UniformQuantizeOp op, mhlo::UniformQuantizeOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto quantized_type = getElementTypeOrSelf(op.getResult().getType()) - .dyn_cast(); - // Currently for activation, PTQ supports per-tensor quantization only, and - // UniformQuantize op is only for activation. - if (!quantized_type) { - return rewriter.notifyMatchFailure( - op, "Legalization supports only per-tensor quantization."); - } auto input_element_type = getElementTypeOrSelf(op.getOperand().getType()); if (input_element_type.isF32()) { - return matchAndRewriteQuantize(op, adaptor, rewriter, quantized_type); - } else if (input_element_type.isa()) { - return matchAndRewriteRequantize(op, adaptor, rewriter, quantized_type); + auto quant_type = GetQuantType(op.getResult().getType()); + if (succeeded(quant_type)) { + return matchAndRewriteQuantize(op, adaptor, rewriter, *quant_type); + } + } else if (input_element_type.isa()) { + return matchAndRewriteRequantize(op, adaptor, rewriter); } return rewriter.notifyMatchFailure(op, "Unsupported input element type."); } - LogicalResult matchAndRewriteQuantize( - mhlo::UniformQuantizeOp op, mhlo::UniformQuantizeOpAdaptor adaptor, - ConversionPatternRewriter &rewriter, - const quant::UniformQuantizedType &quantized_type) const { - Value scale = rewriter.create( - op->getLoc(), rewriter.getF32FloatAttr(quantized_type.getScale())); - Value zero_point = rewriter.create( - op->getLoc(), rewriter.getF32FloatAttr( - static_cast(quantized_type.getZeroPoint()))); - Value quantization_min = rewriter.create( - op->getLoc(), rewriter.getF32FloatAttr(static_cast( - quantized_type.getStorageTypeMin()))); - Value quantization_max = rewriter.create( - op->getLoc(), rewriter.getF32FloatAttr(static_cast( - quantized_type.getStorageTypeMax()))); + LogicalResult matchAndRewriteQuantize(mhlo::UniformQuantizeOp op, + mhlo::UniformQuantizeOpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + QuantType quant_type) const { + Value scales, zero_points; + DenseIntElementsAttr broadcast_dims; + GetQuantizationParams(rewriter, op->getLoc(), quant_type, scales, + zero_points, /*output_zero_point_in_fp=*/true, + broadcast_dims); + + Value quantization_min, quantization_max; + GetQuantizationStorageInfo(rewriter, op->getLoc(), quant_type, + quantization_min, quantization_max); auto res_float_tensor_type = op.getOperand().getType().clone(rewriter.getF32Type()); Value res_float = rewriter.create( - op->getLoc(), res_float_tensor_type, adaptor.getOperand(), scale, - nullptr); + op->getLoc(), res_float_tensor_type, adaptor.getOperand(), scales, + broadcast_dims); res_float = rewriter.create( - op->getLoc(), res_float_tensor_type, res_float, zero_point, nullptr); + op->getLoc(), res_float_tensor_type, res_float, zero_points, + broadcast_dims); res_float = rewriter.create( op->getLoc(), res_float_tensor_type, 
quantization_min, res_float, quantization_max); res_float = rewriter.create( op->getLoc(), res_float_tensor_type, res_float); - auto res_final_tensor_type = - res_float_tensor_type.clone(quantized_type.getStorageType()); + auto res_final_tensor_type = res_float_tensor_type.clone( + GetQuantStorageType(op.getResult().getType().getElementType())); rewriter.replaceOpWithNewOp(op, res_final_tensor_type, res_float); return success(); } - // Requantization is essentially dequantize --> quantize. - // - // Dequantize: (input - zp) * scale - // Quantize: input / scale + zp - // - // Hence, - // result = (input - input_zp) * input_scale / output_scale + output_zp - // - // This is simplified as: - // result = input * merged_scale + merged_zp - // where: - // merged_zp = output_zp - input_zp * merged_scale. - // merged_scale = input_scale / output_scale. LogicalResult matchAndRewriteRequantize( mhlo::UniformQuantizeOp op, mhlo::UniformQuantizeOpAdaptor adaptor, - ConversionPatternRewriter &rewriter, - const quant::UniformQuantizedType &output_quantized_type) const { + ConversionPatternRewriter &rewriter) const { auto input_quantized_type = getElementTypeOrSelf(op.getOperand().getType()) - .cast(); - auto result_quantized_type = getElementTypeOrSelf(op.getResult().getType()) - .cast(); - - double merged_scale_fp = - input_quantized_type.getScale() / result_quantized_type.getScale(); - Value merged_scale = rewriter.create( - op->getLoc(), - rewriter.getF32FloatAttr(static_cast(merged_scale_fp))); - - auto res_float_tensor_type = - op.getOperand().getType().clone(rewriter.getF32Type()); - Value res_float = rewriter.create( - op->getLoc(), res_float_tensor_type, adaptor.getOperand()); - - res_float = rewriter.create( - op->getLoc(), res_float_tensor_type, res_float, merged_scale, nullptr); - - // Add merged_zp only when it is non-zero. - double merged_zp_fp = result_quantized_type.getZeroPoint() - - input_quantized_type.getZeroPoint() * merged_scale_fp; - if (merged_zp_fp != 0) { - Value merged_zp = rewriter.create( - op->getLoc(), - rewriter.getF32FloatAttr(static_cast(merged_zp_fp))); - res_float = rewriter.create( - op->getLoc(), res_float_tensor_type, res_float, merged_zp, nullptr); - } - - Value quantization_min = rewriter.create( - op->getLoc(), rewriter.getF32FloatAttr(static_cast( - output_quantized_type.getStorageTypeMin()))); - Value quantization_max = rewriter.create( - op->getLoc(), rewriter.getF32FloatAttr(static_cast( - output_quantized_type.getStorageTypeMax()))); - - // Clamp results by [quantization_min, quantization_max]. 
- res_float = rewriter.create( - op->getLoc(), res_float_tensor_type, quantization_min, res_float, - quantization_max); - res_float = rewriter.create( - op->getLoc(), res_float_tensor_type, res_float); - - auto res_final_tensor_type = - res_float_tensor_type.clone(output_quantized_type.getStorageType()); - rewriter.replaceOpWithNewOp(op, res_final_tensor_type, - res_float); + .cast(); + auto output_quantized_type = getElementTypeOrSelf(op.getResult().getType()) + .cast(); + rewriter.replaceOp( + op, Requantize(op, adaptor.getOperand(), input_quantized_type, + output_quantized_type, + op.getResult().getType().cast().clone( + output_quantized_type.getStorageType()), + rewriter)); return success(); } }; @@ -278,19 +321,15 @@ class ConvertUniformDequantizeOp LogicalResult matchAndRewrite( mhlo::UniformDequantizeOp op, mhlo::UniformDequantizeOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto element_type = getElementTypeOrSelf(op.getOperand().getType()) - .dyn_cast(); - // Currently for activation, PTQ supports per-tensor quantization only, and - // UniformQuantize op is only for activation. - if (!element_type) { - return rewriter.notifyMatchFailure( - op, "Legalization supports only per-tensor quantization."); + auto quant_type = GetQuantType(op.getOperand().getType()); + if (failed(quant_type)) { + return failure(); } - Value scale = rewriter.create( - op->getLoc(), rewriter.getF32FloatAttr(element_type.getScale())); - Value zero_point = rewriter.create( - op->getLoc(), rewriter.getI32IntegerAttr( - static_cast(element_type.getZeroPoint()))); + Value scales, zero_points; + DenseIntElementsAttr broadcast_dims; + GetQuantizationParams(rewriter, op->getLoc(), *quant_type, scales, + zero_points, + /*output_zero_point_in_fp=*/false, broadcast_dims); Value input = adaptor.getOperand(); // TODO: b/260280919 - Consider avoiding conversion to int32. @@ -299,13 +338,14 @@ class ConvertUniformDequantizeOp Value res_int32 = rewriter.create( op->getLoc(), res_int32_tensor_type, input); res_int32 = rewriter.create( - op->getLoc(), res_int32_tensor_type, res_int32, zero_point, nullptr); + op->getLoc(), res_int32_tensor_type, res_int32, zero_points, + broadcast_dims); auto res_float_tensor_type = res_int32.getType().cast().clone(rewriter.getF32Type()); Value res_float = rewriter.create( op->getLoc(), res_float_tensor_type, res_int32); res_float = rewriter.replaceOpWithNewOp( - op, res_float_tensor_type, res_float, scale, nullptr); + op, res_float_tensor_type, res_float, scales, broadcast_dims); return success(); } }; @@ -317,18 +357,14 @@ class ConvertUniformQuantizedAddOp : public OpConversionPattern { LogicalResult matchAndRewrite( mhlo::AddOp op, mhlo::AddOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto lhs_element_type = op.getLhs() - .getType() - .getElementType() - .dyn_cast(); - auto rhs_element_type = op.getRhs() - .getType() - .getElementType() - .dyn_cast(); + auto lhs_element_type = + op.getLhs().getType().getElementType().dyn_cast(); + auto rhs_element_type = + op.getRhs().getType().getElementType().dyn_cast(); auto result_element_type = op.getResult() .getType() .getElementType() - .dyn_cast(); + .dyn_cast(); // We only handle cases where lhs, rhs and results all have quantized // element type. @@ -347,20 +383,14 @@ class ConvertUniformQuantizedAddOp : public OpConversionPattern { // be the same as the result. // TODO: b/260280919 - Consider avoiding conversion to int32. 
Value lhs = adaptor.getLhs(); - Value lhs_int32_tensor; - if (failed(RequantizeWithoutClamping(op, lhs, res_int32_tensor_type, - lhs_element_type, result_element_type, - lhs_int32_tensor, rewriter))) { - return failure(); - } + Value lhs_int32_tensor = + Requantize(op, lhs, lhs_element_type, result_element_type, + res_int32_tensor_type, rewriter); Value rhs = adaptor.getRhs(); - Value rhs_int32_tensor; - if (failed(RequantizeWithoutClamping(op, rhs, res_int32_tensor_type, - rhs_element_type, result_element_type, - rhs_int32_tensor, rewriter))) { - return failure(); - } + Value rhs_int32_tensor = + Requantize(op, rhs, rhs_element_type, result_element_type, + res_int32_tensor_type, rewriter); Value zero_point = rewriter.create( op->getLoc(), rewriter.getI32IntegerAttr(static_cast( @@ -437,9 +467,9 @@ LogicalResult matchAndRewriteDotLikeHybridOp( // result = hybridOp(lhs, dequant(rhs)) Value lhs_float32_tensor = adaptor.getLhs(); Value rhs = adaptor.getRhs(); - quant::UniformQuantizedType rhs_element_type = + UniformQuantizedType rhs_element_type = getElementTypeOrSelf(op.getRhs().getType()) - .template cast(); + .template cast(); auto res_float32_tensor_type = op.getResult().getType().template cast(); auto rhs_float32_tensor_type = @@ -481,7 +511,7 @@ Value CreateZeroPointPartialOffset(OpBuilder &builder, Location loc, // Calculate the output tensor shape. This is input tensor dims minus // contracting dims. - auto ranked_tensor = tensor.getType().dyn_cast(); + auto ranked_tensor = tensor.getType().cast(); llvm::SmallVector output_dims; for (int64_t i = 0; i < ranked_tensor.getRank(); ++i) { if (absl::c_count(reduction_dims, i) == 0) { @@ -492,7 +522,7 @@ Value CreateZeroPointPartialOffset(OpBuilder &builder, Location loc, // Convert input tensor to output type since mhlo::Reduce only supports same // element type for input/output. tensor = builder.create( - loc, tensor.getType().dyn_cast().clone(output_element_type), + loc, tensor.getType().cast().clone(output_element_type), tensor); auto reducer_tensor_type = RankedTensorType::get({}, output_element_type); @@ -592,7 +622,7 @@ Value BroadcastZpContribution(OpBuilder &builder, Location loc, // zero-point-offset tensor to the final output tensor, and then do the // broadcast. auto zp_contribution_rank = - zp_contribution.getType().dyn_cast().getRank(); + zp_contribution.getType().cast().getRank(); llvm::SmallVector broadcast_dims; broadcast_dims.resize(zp_contribution_rank, 0); // Result tensor will have batching dims first, then LHS result dims, then @@ -615,7 +645,7 @@ Value BroadcastZpContribution(OpBuilder &builder, Location loc, } // Use broadcast_in_dim or dyanmic_broadcast_in_dim based on input shape // dynamism. - if (zp_contribution.getType().dyn_cast().hasStaticShape()) { + if (zp_contribution.getType().cast().hasStaticShape()) { zp_contribution = builder.create( loc, output_tensor_type, zp_contribution, DenseIntElementsAttr::get( @@ -742,13 +772,13 @@ Value CreateDotLikeKernel(OpBuilder &builder, Location loc, DenseIntElementsAttr::get( RankedTensorType::get({}, builder.getI8Type()), {static_cast(getElementTypeOrSelf(op.getLhs().getType()) - .dyn_cast() + .cast() .getZeroPoint())})); // Convert Padding attributes from mhlo::Convolution to mhlo::Pad. Note that // Padding is applied for spatial dimensions [1...rank-1) only for // mhlo::Convolution. But mhlo::Pad require those for all dimensions. Hence // we add 0 to the beginning and end of the padding vectors. 
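// Illustrative example (not from the patch): for a rank-4 NHWC input with
// mhlo.convolution spatial padding [[1, 1], [2, 2]], the mhlo.pad attributes
// become padding_low = [0, 1, 2, 0], padding_high = [0, 1, 2, 0], and
// padding_interior = [0, 0, 0, 0].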
- int64_t rank = lhs.getType().dyn_cast().getRank(); + int64_t rank = lhs.getType().cast().getRank(); llvm::SmallVector padding_low(rank, 0), padding_high(rank, 0), padding_interior(rank, 0); for (int64_t i = 1; i < rank - 1; ++i) { @@ -786,15 +816,12 @@ LogicalResult matchAndRewriteDotLikeOp(DotLikeOp op, DotLikeOpAdaptor adaptor, ConversionPatternRewriter &rewriter) { // Lower Dot/DotGeneral UQ ops to DotGeneral int. // Assumes that operands and results are uq types. - auto lhs_element_quant_type = - getElementTypeOrSelf(op.getLhs().getType()) - .template dyn_cast(); - auto rhs_element_quant_type = - getElementTypeOrSelf(op.getRhs().getType()) - .template dyn_cast(); - auto res_element_quant_type = - getElementTypeOrSelf(op.getResult()) - .template dyn_cast(); + auto lhs_element_quant_type = getElementTypeOrSelf(op.getLhs().getType()) + .template dyn_cast(); + auto rhs_element_quant_type = getElementTypeOrSelf(op.getRhs().getType()) + .template dyn_cast(); + auto res_element_quant_type = getElementTypeOrSelf(op.getResult()) + .template dyn_cast(); Value lhs = adaptor.getLhs(); Value rhs = adaptor.getRhs(); auto res_int32_tensor_type = @@ -837,8 +864,7 @@ LogicalResult matchAndRewriteDotLikeOp(DotLikeOp op, DotLikeOpAdaptor adaptor, // Skip zp_offset if it is 0. if (zp_offset) { auto zp_offset_float32_tensor_type = - zp_offset.getType().dyn_cast().clone( - rewriter.getF32Type()); + zp_offset.getType().cast().clone(rewriter.getF32Type()); zp_offset = rewriter.create( op->getLoc(), zp_offset_float32_tensor_type, zp_offset); zp_offset = rewriter.create( @@ -867,15 +893,12 @@ template FailureOr IsDotLikeOpHybrid(DotLikeOp op) { // Checks whether a dot-like op is hybrid by looking at input/output types. // Returns failure() when the type is not supported. - auto lhs_element_quant_type = - getElementTypeOrSelf(op.getLhs().getType()) - .template dyn_cast(); - auto rhs_element_quant_type = - getElementTypeOrSelf(op.getRhs().getType()) - .template dyn_cast(); - auto res_element_quant_type = - getElementTypeOrSelf(op.getResult()) - .template dyn_cast(); + auto lhs_element_quant_type = getElementTypeOrSelf(op.getLhs().getType()) + .template dyn_cast(); + auto rhs_element_quant_type = getElementTypeOrSelf(op.getRhs().getType()) + .template dyn_cast(); + auto res_element_quant_type = getElementTypeOrSelf(op.getResult()) + .template dyn_cast(); if (lhs_element_quant_type && rhs_element_quant_type && res_element_quant_type) { return false; @@ -996,8 +1019,7 @@ bool IsConvNDHWC(const mhlo::ConvDimensionNumbersAttr &dims) { FailureOr VerifyConvolutionOp(mhlo::ConvolutionOp op) { // RHS (weight) must have zero zp. auto rhs_element_quant_type = - getElementTypeOrSelf(op.getRhs().getType()) - .template dyn_cast(); + getElementTypeOrSelf(op.getRhs().getType()).cast(); if (rhs_element_quant_type.getZeroPoint() != 0) { op->emitError("RHS UQ type must have zero zp."); return failure(); @@ -1074,15 +1096,15 @@ class ConvertGenericOp : public ConversionPattern { // Check that all operands and result uq types are the same. llvm::SmallVector uq_types; for (auto result_type : op->getResultTypes()) { - auto type = getElementTypeOrSelf(result_type) - .dyn_cast(); + auto type = + getElementTypeOrSelf(result_type).dyn_cast(); if (type) { uq_types.push_back(type); } } for (auto operand : op->getOperands()) { auto type = getElementTypeOrSelf(operand.getType()) - .dyn_cast(); + .dyn_cast(); if (type) { uq_types.push_back(type); } @@ -1097,15 +1119,7 @@ class ConvertGenericOp : public ConversionPattern { // type otherwise. 
llvm::SmallVector new_result_types; for (auto result_type : op->getResultTypes()) { - if (getElementTypeOrSelf(result_type) - .isa()) { - new_result_types.push_back(result_type.cast().clone( - getElementTypeOrSelf(result_type) - .cast() - .getStorageType())); - } else { - new_result_types.push_back(result_type); - } + new_result_types.push_back(GetQuantStorageType(result_type)); } OperationState state(op->getLoc(), op->getName().getStringRef(), operands, @@ -1120,73 +1134,78 @@ class ConvertGenericOp : public ConversionPattern { class UQTypeConverter : public TypeConverter { public: UQTypeConverter() { - addConversion([](Type type) -> Type { - auto to_legal_type = [](Type type) { - if (auto uq_type = dyn_cast(type)) { - return uq_type.getStorageType(); - } - return type; - }; - if (auto shaped = type.dyn_cast()) { - return shaped.clone(to_legal_type(shaped.getElementType())); - } else { - return to_legal_type(type); - } - }); + addConversion([](Type type) -> Type { return GetQuantStorageType(type); }); } }; -// Performs conversion of MHLO quant ops to primitive ops. -void ConvertMHLOQuantToInt::runOnOperation() { - Operation *op = getOperation(); - MLIRContext *context = op->getContext(); - RewritePatternSet patterns(context); - - // Populate MHLO quant ops conversion patterns. - patterns.add(context); - - // uq->int convert patterns for func.func and func.return. - UQTypeConverter converter; - populateFunctionOpInterfaceTypeConversionPattern(patterns, - converter); - populateReturnOpTypeConversionPattern(patterns, converter); - - ConversionTarget target(*op->getContext()); - auto is_legal = [&converter](Operation *op) { return converter.isLegal(op); }; - target.addDynamicallyLegalDialect(is_legal); - target.addDynamicallyLegalDialect(is_legal); - target.addDynamicallyLegalDialect( - [&converter](Operation *op) { - if (auto func = dyn_cast(op)) { - return converter.isSignatureLegal(func.getFunctionType()); - } - return converter.isLegal(op); - }); - - LogicalResult result = - applyPartialConversion(op, target, std::move(patterns)); - if (failed(result)) { - signalPassFailure(); +#define GEN_PASS_DEF_CONVERTMHLOQUANTTOINT +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h.inc" + +class ConvertMHLOQuantToInt + : public impl::ConvertMHLOQuantToIntBase { + public: + ConvertMHLOQuantToInt() = default; + ConvertMHLOQuantToInt(const ConvertMHLOQuantToInt &) {} + + explicit ConvertMHLOQuantToInt(bool legalize_chlo) { + legalize_chlo_ = legalize_chlo; } - // Legalize CHLO if needed. - if (!legalize_chlo_) return; - RewritePatternSet patterns_2(context); + // Performs conversion of MHLO quant ops to primitive ops. + void runOnOperation() override { + Operation *op = getOperation(); + MLIRContext *context = op->getContext(); + RewritePatternSet patterns(context); + + // Populate MHLO quant ops conversion patterns. + patterns.add( + context); + + // uq->int convert patterns for func.func and func.return. 
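// (Illustrative, not part of the patch) After this signature conversion a
// function such as
//   func.func @f(%arg0: tensor<2x!quant.uniform<i8:f32, 5.000000e-01:2>>)
//       -> tensor<2x!quant.uniform<i8:f32, 5.000000e-01:2>>
// is rewritten to operate on the storage type:
//   func.func @f(%arg0: tensor<2xi8>) -> tensor<2xi8>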
+ UQTypeConverter converter; + populateFunctionOpInterfaceTypeConversionPattern(patterns, + converter); + populateReturnOpTypeConversionPattern(patterns, converter); + + ConversionTarget target(*op->getContext()); + auto is_legal = [&converter](Operation *op) { + return converter.isLegal(op); + }; + target.addDynamicallyLegalDialect(is_legal); + target.addDynamicallyLegalDialect(is_legal); + target.addDynamicallyLegalDialect( + [&converter](Operation *op) { + if (auto func = dyn_cast(op)) { + return converter.isSignatureLegal(func.getFunctionType()); + } + return converter.isLegal(op); + }); + + LogicalResult result = + applyPartialConversion(op, target, std::move(patterns)); + if (failed(result)) { + signalPassFailure(); + } + + // Legalize CHLO if needed. + if (!legalize_chlo_) return; + RewritePatternSet patterns_2(context); - chlo::populateDecomposeChloPatterns(context, &patterns_2); - chlo::populateChloBroadcastingPatterns(context, &patterns_2); + chlo::populateDecomposeChloPatterns(context, &patterns_2); + chlo::populateChloBroadcastingPatterns(context, &patterns_2); - ConversionTarget target_2 = - mhlo::GetDefaultLegalConversionTargets(*op->getContext(), legalize_chlo_); + ConversionTarget target_2 = mhlo::GetDefaultLegalConversionTargets( + *op->getContext(), legalize_chlo_); - result = applyPartialConversion(op, target_2, std::move(patterns_2)); - if (failed(result)) { - signalPassFailure(); + result = applyPartialConversion(op, target_2, std::move(patterns_2)); + if (failed(result)) { + signalPassFailure(); + } } -} +}; } // namespace diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc index 9c2b88abddb8a5..dc686d8bf0fb35 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include @@ -21,6 +22,7 @@ limitations under the License. #include #include "absl/log/check.h" +#include "absl/random/random.h" #include "absl/status/status.h" #include "absl/strings/string_view.h" #include "absl/types/span.h" @@ -29,6 +31,8 @@ limitations under the License. #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -65,6 +69,7 @@ class ConvertTfQuantToMhloIntTest : public ::testing::Test { dialects.insert(); ctx_ = std::make_unique(dialects); + ctx_->loadAllAvailableDialects(); // Create a CPU client with 1 device. TF_ASSERT_OK_AND_ASSIGN( @@ -74,11 +79,10 @@ class ConvertTfQuantToMhloIntTest : public ::testing::Test { CHECK(device_); } - // Evaluate return value of a function using TF kernel. - // This assumes that the module op has only 1 function and it has TF ops only. 
- absl::StatusOr> EvaluateTfFunction( + absl::StatusOr> ReplaceFuncArgsByConstant( absl::string_view program, - absl::Span arguments) { + absl::Span arguments, + bool use_mhlo_const = false) { auto module_op = parseSourceString(program, ctx_.get()); CHECK(module_op); auto func_op = llvm::dyn_cast( @@ -90,9 +94,9 @@ class ConvertTfQuantToMhloIntTest : public ::testing::Test { return absl::InternalError("Input argument has wrong size"); } - // Convert input xla::Literal arguments to tf.Const, this allows using + // Convert input xla::Literal arguments to constants, this allows using // constant folding to evaluate function return value. - mlir::OpBuilder builder(func_op); + mlir::OpBuilder builder(ctx_.get()); for (int i = 0; i < arguments.size(); ++i) { const xla::Literal* const xla_literal = arguments[i]; tensorflow::TensorShape shape; @@ -109,15 +113,38 @@ class ConvertTfQuantToMhloIntTest : public ::testing::Test { xla_literal->element_count()); TF_ASSIGN_OR_RETURN(auto attrs, tensorflow::ConvertTensor(tensor, &builder)); - auto cst = builder.create(func_op->getLoc(), attrs); + builder.setInsertionPoint( + &func_op.getFunctionBody().getBlocks().front().front()); + // Use mhlo.Constant when it is consumed by the lowering passes since they + // can't lower tf.Const. + Value cst; + if (use_mhlo_const) { + cst = builder.create(func_op->getLoc(), attrs); + } else { + cst = builder.create(func_op->getLoc(), attrs); + } func_op.getArgument(i).replaceAllUsesWith(cst); } + return module_op; + } + // Evaluate return value of a function using TF kernel. + // This assumes that the module op has only 1 function and it has TF ops only. + absl::StatusOr> EvaluateTfFunction( + absl::string_view program, + absl::Span arguments) { + TF_ASSIGN_OR_RETURN(auto module_op, + ReplaceFuncArgsByConstant(program, arguments)); // Constant fold the func.Return op's producer op to evaluate the return // value. The evaluation will use TF kernels. // This assumes that func.Return is the last op in the function and it // returns only 1 value. - auto& return_op = func_op.getFunctionBody().getBlocks().back().back(); + auto& return_op = llvm::dyn_cast( + *module_op->getBodyRegion().getOps().begin()) + .getFunctionBody() + .getBlocks() + .back() + .back(); if (!llvm::isa(return_op) || return_op.getNumOperands() != 1) { return absl::InternalError( @@ -150,17 +177,19 @@ class ConvertTfQuantToMhloIntTest : public ::testing::Test { } absl::StatusOr> CompileProgram( - absl::string_view program) { - // Parse the program. - auto module_op = parseSourceString(program, ctx_.get()); - CHECK(module_op); + absl::string_view program, + absl::Span arguments) { + // Replace args by mhlo.constant since the lowering passes can't lower + // tf.Const. + TF_ASSIGN_OR_RETURN( + auto module_op, + ReplaceFuncArgsByConstant(program, arguments, /*use_mhlo_const=*/true)); + // Run the Convert TF Quant Types, TF Quant -> MHLO Quant and MHLO Quant -> // MHLO int passes. PassManager pm(module_op->getContext()); pm.addNestedPass(CreateConvertTFQuantTypesPass()); - pm.addNestedPass(CreateConvertTFQuantOpsToMHLOPass()); - pm.addNestedPass( - stablehlo::createConvertMHLOQuantToIntPass(false)); + AddQuantizationLoweringPasses(pm); CHECK(succeeded(pm.run(module_op.get()))); // Compile the program. 
return pjrt_client_->Compile(*module_op, xla::CompileOptions{}); @@ -190,206 +219,366 @@ class ConvertTfQuantToMhloIntTest : public ::testing::Test { void ExecuteAndCompareResultsWithTfKernel( absl::string_view program, absl::Span arguments, - float error_tolerance = 0.1) { - TF_ASSERT_OK_AND_ASSIGN(auto executable, this->CompileProgram(program)); + std::optional tf_program = std::nullopt, + double error_tolerance = 0.1) { + // Expected result is calculated by evaluating using TF kernels. In some + // cases, TF kernel behaves differently from lowered graph (e.g. Hybrid + // ops). So we optionally use a different graph to calculate the expected + // result. + TF_ASSERT_OK_AND_ASSIGN( + auto expected, + this->EvaluateTfFunction( + (tf_program.has_value() ? *tf_program : program), arguments)); + TF_ASSERT_OK_AND_ASSIGN(auto executable, + this->CompileProgram(program, arguments)); TF_ASSERT_OK_AND_ASSIGN( - auto result_literal, + auto result, this->ExecuteProgramAndReturnSingleResult(executable.get(), arguments)); - TF_ASSERT_OK_AND_ASSIGN(auto expected, - this->EvaluateTfFunction(program, arguments)); - EXPECT_TRUE(xla::LiteralTestUtil::Near(*expected, *result_literal, + // Convert to double for comparison. This is needed for comparing integers + // since it LiteralTestUtil asserts different integers even if it is within + // error_spec. + TF_ASSERT_OK_AND_ASSIGN(auto expected_double, expected->Convert(xla::F64)) + TF_ASSERT_OK_AND_ASSIGN(auto result_double, result->Convert(xla::F64)) + EXPECT_TRUE(xla::LiteralTestUtil::Near(expected_double, result_double, xla::ErrorSpec(error_tolerance))); } + absl::StatusOr CreateRandomF32Literal( + absl::Span dims, float min = -100, float max = 100) { + TF_ASSIGN_OR_RETURN(auto shape, + xla::ShapeUtil::MakeValidatedShape(xla::F32, dims)); + return xla::LiteralUtil::CreateLiteralWithGenerator( + shape, [this, min, max](absl::Span dims) -> float { + return absl::Uniform(bitgen_, min, max); + }); + } + + absl::StatusOr CreateRandomI8Literal( + absl::Span dims, int8_t min = -128, int8_t max = 127) { + TF_ASSIGN_OR_RETURN(auto shape, + xla::ShapeUtil::MakeValidatedShape(xla::S8, dims)); + return xla::LiteralUtil::CreateLiteralWithGenerator( + shape, [this, min, max](absl::Span dims) -> int8_t { + return absl::Uniform(bitgen_, min, max); + }); + } + + absl::StatusOr CreateRandomI32Literal( + absl::Span dims, int32_t min = -128, int32_t max = 127) { + TF_ASSIGN_OR_RETURN(auto shape, + xla::ShapeUtil::MakeValidatedShape(xla::S32, dims)); + return xla::LiteralUtil::CreateLiteralWithGenerator( + shape, [this, min, max](absl::Span dims) -> int32_t { + return absl::Uniform(bitgen_, min, max); + }); + } + std::unique_ptr ctx_; std::unique_ptr pjrt_client_; xla::PjRtDevice* device_; + absl::BitGen bitgen_; }; TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeAndDequantize) { constexpr absl::string_view kProgram = R"mlir( -func.func @main(%arg0: tensor<4xf32>) -> tensor<4xf32> { - %scale = "tf.Const"() { value = dense<10.0> : tensor } : () - -> tensor +func.func @main(%arg0: tensor<10xf32>) -> tensor<10xf32> { + %scale = "tf.Const"() { value = dense<0.347> : tensor } : () -> tensor %zp = "tf.Const"() { value = dense<3> : tensor } : () -> tensor %0 = "tf.UniformQuantize"(%arg0, %scale, %zp) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 - } : (tensor<4xf32>, tensor, tensor) -> tensor<4x!tf_type.qint8> + } : (tensor<10xf32>, tensor, tensor) -> tensor<10x!tf_type.qint8> %1 = "tf.UniformDequantize"(%0, %scale, %zp) { 
quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 - } : (tensor<4x!tf_type.qint8>, tensor, tensor) -> tensor<4xf32> - return %1 : tensor<4xf32> + } : (tensor<10x!tf_type.qint8>, tensor, tensor) -> tensor<10xf32> + return %1 : tensor<10xf32> })mlir"; - auto arg0 = - xla::LiteralUtil::CreateR1({100.0f, 20000.0f, -2409.0f, -25.1f}); - ExecuteAndCompareResultsWithTfKernel(kProgram, {&arg0}); + TF_ASSERT_OK_AND_ASSIGN(auto arg0, CreateRandomF32Literal({10})); + // error_tolerance is set to be slightly > scale because different rounding + // implementations for UniformQuantize in TF kernel and the lowering passes + // may cause +/-1 differences. + ExecuteAndCompareResultsWithTfKernel( + kProgram, {&arg0}, /*tf_program=*/std::nullopt, /*error_tolerance=*/0.35); +} + +TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizePerChannel) { + constexpr absl::string_view kProgram = R"mlir( +func.func @main( + %arg0: tensor<10x10xf32>, %scale: tensor<10xf32>, %zp: tensor<10xi32> + ) -> tensor<10x10xi8> { + %0 = "tf.UniformQuantize"(%arg0, %scale, %zp) { + quantization_axis = 1 : i64, + quantization_min_val = -128 : i64, + quantization_max_val = 127 : i64 + } : (tensor<10x10xf32>, tensor<10xf32>, tensor<10xi32>) -> tensor<10x10x!tf_type.qint8> + %1 = "tf.Cast"(%0) {} : (tensor<10x10x!tf_type.qint8>) -> tensor<10x10xi8> + return %1 : tensor<10x10xi8> +})mlir"; + TF_ASSERT_OK_AND_ASSIGN(auto arg0, CreateRandomF32Literal({10, 10})); + TF_ASSERT_OK_AND_ASSIGN( + auto scale, CreateRandomF32Literal({10}, /*min=*/0.0001, /*max=*/2)); + TF_ASSERT_OK_AND_ASSIGN(auto zp, CreateRandomI32Literal({10})); + // Different rounding implementations for UniformQuantize in TF kernel and the + // lowering passes may cause +/-1 differences. + ExecuteAndCompareResultsWithTfKernel(kProgram, {&arg0, &scale, &zp}, + /*tf_program=*/std::nullopt, + /*error_tolerance=*/1.0); +} + +TEST_F(ConvertTfQuantToMhloIntTest, UniformDequantizePerChannel) { + constexpr absl::string_view kProgram = R"mlir( +func.func @main( + %arg0: tensor<10x10xi8>, %scale: tensor<10xf32>, %zp: tensor<10xi32> + ) -> tensor<10x10xf32> { + %0 = "tf.Cast"(%arg0) {} : (tensor<10x10xi8>) -> tensor<10x10x!tf_type.qint8> + %1 = "tf.UniformDequantize"(%0, %scale, %zp) { + quantization_axis = 1 : i64, + quantization_min_val = -128 : i64, + quantization_max_val = 127 : i64 + } : (tensor<10x10x!tf_type.qint8>, tensor<10xf32>, tensor<10xi32>) -> tensor<10x10xf32> + return %1 : tensor<10x10xf32> +})mlir"; + TF_ASSERT_OK_AND_ASSIGN(auto arg0, CreateRandomI8Literal({10, 10})); + TF_ASSERT_OK_AND_ASSIGN( + auto scale, CreateRandomF32Literal({10}, /*min=*/0.0001, /*max=*/2)); + TF_ASSERT_OK_AND_ASSIGN(auto zp, CreateRandomI32Literal({10})); + ExecuteAndCompareResultsWithTfKernel(kProgram, {&arg0, &scale, &zp}); } TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeConvolution) { constexpr absl::string_view kProgram = R"mlir( -func.func @main(%input: tensor<1x2x2x1xf32>, %filter: tensor<2x1x1x1xf32>) -> tensor<1x2x2x1xf32> { - %input_scale = "tf.Const"() { value = dense<7.3> : tensor } : () - -> tensor - %input_zp = "tf.Const"() { value = dense<-45> : tensor } : () -> tensor - %filter_scale = "tf.Const"() { value = dense<0.047> : tensor } : () - -> tensor - %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor - %accum_scale = "tf.Const"() { value = dense<0.3431> : tensor } : () - -> tensor - %accum_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor - %quant_input = "tf.UniformQuantize"(%input, %input_scale, 
%input_zp) { - Tin = "tfdtype$DT_FLOAT", Tout = "tfdtype$DT_QINT8", - attr_map = "", quantization_axis = -1 : i64, quantization_max_val = 127 : i64, - quantization_min_val = -128 : i64 - } : (tensor<1x2x2x1xf32>, tensor, tensor) -> tensor<1x2x2x1x!tf_type.qint8> - %quant_filter = "tf.UniformQuantize"(%filter, %filter_scale, %filter_zp) { - Tin = "tfdtype$DT_FLOAT", Tout = "tfdtype$DT_QINT8", - attr_map = "", quantization_axis = -1 : i64, - quantization_max_val = 127 : i64, quantization_min_val = -128 : i64 - } : (tensor<2x1x1x1xf32>, tensor, tensor) -> tensor<2x1x1x1x!tf_type.qint8> - %0 = "tf.UniformQuantizedConvolution"( - %quant_input, %quant_filter, %input_scale, %input_zp, - %filter_scale, %filter_zp, %accum_scale, %accum_zp - ) { - Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_QINT32", - attr_map = "", batch_group_count = 1 : i64, - dimension_numbers = "\10\03\1A\02\01\02 \02(\032\02\00\01@\03J\02\01\02", - explicit_padding = [], feature_group_count = 1 : i64, lhs_dilation = [1, 1], - lhs_quantization_axis = -1 : i64, lhs_quantization_max_val = 127 : i64, - lhs_quantization_min_val = -128 : i64, output_quantization_axis = -1 : i64, - output_quantization_max_val = 2147483647 : i64, - output_quantization_min_val = -2147483648 : i64, padding = "SAME", - rhs_dilation = [1, 1], rhs_quantization_axis = -1 : i64, - rhs_quantization_max_val = 127 : i64, rhs_quantization_min_val = -128 : i64, - window_strides = [1, 1] - } : (tensor<1x2x2x1x!tf_type.qint8>, tensor<2x1x1x1x!tf_type.qint8>, - tensor, tensor, tensor, tensor, tensor, tensor - ) -> tensor<1x2x2x1x!tf_type.qint32> - %output = "tf.UniformDequantize"(%0, %accum_scale, %accum_zp) { - quantization_axis = -1 : i64, quantization_min_val = -128 : i64, - quantization_max_val = 127 : i64 - } : (tensor<1x2x2x1x!tf_type.qint32>, tensor, tensor) -> tensor<1x2x2x1xf32> - return %output : tensor<1x2x2x1xf32> +func.func @main(%input: tensor<1x9x9x9xi8>, %filter: tensor<3x3x9x10xi8>) -> tensor<1x9x9x10xi32> { + %input_scale = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor + %input_zp = "tf.Const"() { value = dense<-10> : tensor } : () -> tensor + %filter_scale = "tf.Const"() { value = dense<0.5> : tensor } : () -> tensor + %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %accum_scale = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor + %accum_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %quant_input = "tf.Cast"(%input) {} : (tensor<1x9x9x9xi8>) -> + tensor<1x9x9x9x!tf_type.qint8> + %quant_filter = "tf.Cast"(%filter) {} : (tensor<3x3x9x10xi8>) -> + tensor<3x3x9x10x!tf_type.qint8> + %0 = "tf.UniformQuantizedConvolution"( + %quant_input, %quant_filter, %input_scale, %input_zp, + %filter_scale, %filter_zp, %accum_scale, %accum_zp + ) { + Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_QINT32", + attr_map = "", batch_group_count = 1 : i64, + dimension_numbers = "\10\03\1A\02\01\02 \02(\032\02\00\01@\03J\02\01\02", + explicit_padding = [], feature_group_count = 1 : i64, lhs_dilation = [1, 1], + lhs_quantization_axis = -1 : i64, lhs_quantization_max_val = 127 : i64, + lhs_quantization_min_val = -128 : i64, output_quantization_axis = -1 : i64, + output_quantization_max_val = 2147483647 : i64, + output_quantization_min_val = -2147483648 : i64, padding = "SAME", + rhs_dilation = [1, 1], rhs_quantization_axis = -1 : i64, + rhs_quantization_max_val = 127 : i64, rhs_quantization_min_val = -128 : i64, + window_strides = [1, 1] + } : (tensor<1x9x9x9x!tf_type.qint8>, tensor<3x3x9x10x!tf_type.qint8>, + 
tensor, tensor, tensor, tensor, tensor, tensor + ) -> tensor<1x9x9x10x!tf_type.qint32> + %output = "tf.Cast"(%0) {} : (tensor<1x9x9x10x!tf_type.qint32>) -> tensor<1x9x9x10xi32> + return %output : tensor<1x9x9x10xi32> })mlir"; - auto input = xla::LiteralUtil::CreateR4( - {{{{14.f}, {-100.f}}, {{-200.f}, {350.f}}}}); - auto filter = xla::LiteralUtil::CreateR4({{{{4.1f}}}, {{{-2.f}}}}); + TF_ASSERT_OK_AND_ASSIGN(auto input, CreateRandomI8Literal({1, 9, 9, 9})); + TF_ASSERT_OK_AND_ASSIGN(auto filter, CreateRandomI8Literal({3, 3, 9, 10})); ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}); } TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeConvolutionHybrid) { + constexpr absl::string_view kTfProgram = R"mlir( +func.func @main(%input: tensor<2x10x10x10xf32>, %filter: tensor<3x3x10x20xi8>) -> tensor<2x10x10x20xf32> { + %filter_scale = "tf.Const"() { value = dense<0.047> : tensor } : () -> tensor + %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %quant_filter = "tf.Cast"(%filter) {} : (tensor<3x3x10x20xi8>) -> + tensor<3x3x10x20x!tf_type.qint8> + %filter_new = "tf.UniformDequantize"(%quant_filter, %filter_scale, %filter_zp) { + quantization_axis = -1 : i64, quantization_min_val = -128 : i64, + quantization_max_val = 127 : i64 + } : ( + tensor<3x3x10x20x!tf_type.qint8>, tensor, tensor + ) -> tensor<3x3x10x20xf32> + %0 = "tf.Conv2D"(%input, %filter_new) { + Tin = "tfdtype$DT_FLOAT", Tout = "tfdtype$DT_FLOAT", + attr_map = "", batch_group_count = 1 : i64, + explicit_padding = [], feature_group_count = 1 : i64, lhs_dilation = [1, 1], + padding = "SAME", rhs_dilation = [1, 1], strides = [1, 1, 1, 1] + } : (tensor<2x10x10x10xf32>, tensor<3x3x10x20xf32>) -> tensor<2x10x10x20xf32> + return %0 : tensor<2x10x10x20xf32> +})mlir"; constexpr absl::string_view kProgram = R"mlir( -func.func @main(%input: tensor<1x2x2x1xf32>, %filter: tensor<2x1x1x1xf32>) -> tensor<1x2x2x1xf32> { - %filter_scale = "tf.Const"() { value = dense<0.047> : tensor } : () - -> tensor - %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor - %quant_filter = "tf.UniformQuantize"(%filter, %filter_scale, %filter_zp) { - Tin = "tfdtype$DT_FLOAT", Tout = "tfdtype$DT_QINT8", - attr_map = "", quantization_axis = -1 : i64, - quantization_max_val = 127 : i64, quantization_min_val = -128 : i64 - } : (tensor<2x1x1x1xf32>, tensor, tensor) -> tensor<2x1x1x1x!tf_type.qint8> - %0 = "tf.UniformQuantizedConvolutionHybrid"( - %input, %quant_filter, %filter_scale, %filter_zp - ) { - Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_FLOAT", - attr_map = "", batch_group_count = 1 : i64, - dimension_numbers = "\10\03\1A\02\01\02 \02(\032\02\00\01@\03J\02\01\02", - explicit_padding = [], feature_group_count = 1 : i64, lhs_dilation = [1, 1], - padding = "SAME", rhs_dilation = [1, 1], rhs_quantization_axis = -1 : i64, - rhs_quantization_max_val = 127 : i64, rhs_quantization_min_val = -128 : i64, - window_strides = [1, 1] - } : (tensor<1x2x2x1xf32>, tensor<2x1x1x1x!tf_type.qint8>, - tensor, tensor) -> tensor<1x2x2x1xf32> - return %0 : tensor<1x2x2x1xf32> +func.func @main(%input: tensor<2x10x10x10xf32>, %filter: tensor<3x3x10x20xi8>) -> tensor<2x10x10x20xf32> { + %filter_scale = "tf.Const"() { value = dense<0.047> : tensor } : () -> tensor + %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %quant_filter = "tf.Cast"(%filter) {} : (tensor<3x3x10x20xi8>) -> tensor<3x3x10x20x!tf_type.qint8> + %0 = "tf.UniformQuantizedConvolutionHybrid"( + %input, %quant_filter, %filter_scale, %filter_zp + 
) { + Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_FLOAT", + attr_map = "", batch_group_count = 1 : i64, + dimension_numbers = "\10\03\1A\02\01\02 \02(\032\02\00\01@\03J\02\01\02", + explicit_padding = [], feature_group_count = 1 : i64, lhs_dilation = [1, 1], + padding = "SAME", rhs_dilation = [1, 1], rhs_quantization_axis = -1 : i64, + rhs_quantization_max_val = 127 : i64, rhs_quantization_min_val = -128 : i64, + window_strides = [1, 1] + } : (tensor<2x10x10x10xf32>, tensor<3x3x10x20x!tf_type.qint8>, + tensor, tensor) -> tensor<2x10x10x20xf32> + return %0 : tensor<2x10x10x20xf32> })mlir"; - auto input = xla::LiteralUtil::CreateR4( - {{{{14.f}, {-100.f}}, {{-200.f}, {350.f}}}}); - auto filter = xla::LiteralUtil::CreateR4({{{{4.1f}}}, {{{-2.f}}}}); - // The large tolerance here is expected because - // tf.UniformQuantizedConvolutionHybrid does DRQ. But StableHLO hybrid ops - // does weight-only. - ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}, - /*error_tolerance=*/5.0); + TF_ASSERT_OK_AND_ASSIGN(auto input, CreateRandomF32Literal({2, 10, 10, 10})); + TF_ASSERT_OK_AND_ASSIGN(auto filter, CreateRandomI8Literal({3, 3, 10, 20})); + // TF kernels for UniformQuantizedConvolutionHybrid does DRQ. But StableHLO + // hybrid ops does weight-only. So we use a different TF graph for evaluating + // expected weight-only quantized results. + ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}, kTfProgram); } TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeDot) { constexpr absl::string_view kProgram = R"mlir( -func.func @main(%input: tensor<1x2xf32>, %filter: tensor<2x3xf32>) -> tensor<1x3xf32> { - %input_scale = "tf.Const"() { value = dense<0.588> : tensor } : () - -> tensor - %input_zp = "tf.Const"() { value = dense<42> : tensor } : () -> tensor - %filter_scale = "tf.Const"() { value = dense<0.0235> : tensor } : () - -> tensor - %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor - %accum_scale = "tf.Const"() { value = dense<0.013818> : tensor } : () - -> tensor - %accum_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor - %quant_input = "tf.UniformQuantize"(%input, %input_scale, %input_zp) { - Tin = "tfdtype$DT_FLOAT", Tout = "tfdtype$DT_QINT8", attr_map = "", - quantization_axis = -1 : i64, quantization_max_val = 127 : i64, - quantization_min_val = -128 : i64 - } : (tensor<1x2xf32>, tensor, tensor) -> tensor<1x2x!tf_type.qint8> - %quant_filter = "tf.UniformQuantize"(%filter, %filter_scale, %filter_zp) { - Tin = "tfdtype$DT_FLOAT", Tout = "tfdtype$DT_QINT8", attr_map = "", - quantization_axis = -1 : i64, quantization_max_val = 127 : i64, - quantization_min_val = -128 : i64 - } : (tensor<2x3xf32>, tensor, tensor) -> tensor<2x3x!tf_type.qint8> - %0 = "tf.UniformQuantizedDot"( - %quant_input, %quant_filter, %input_scale, %input_zp, %filter_scale, - %filter_zp, %accum_scale, %accum_zp - ) { - Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_QINT32", attr_map = "", - device = "", lhs_quantization_axis = -1 : i64, - lhs_quantization_max_val = 127 : i64, lhs_quantization_min_val = -128 : i64, - output_quantization_axis = -1 : i64, output_quantization_max_val = 2147483647 : i64, - output_quantization_min_val = -2147483648 : i64, rhs_quantization_axis = -1 : i64, - rhs_quantization_max_val = 127 : i64, rhs_quantization_min_val = -128 : i64 - } : ( - tensor<1x2x!tf_type.qint8>, tensor<2x3x!tf_type.qint8>, tensor, - tensor, tensor, tensor, tensor, tensor - ) -> tensor<1x3x!tf_type.qint32> - %output = "tf.UniformDequantize"(%0, %accum_scale, %accum_zp) 
{ - quantization_axis = -1 : i64, quantization_min_val = -128 : i64, - quantization_max_val = 127 : i64 - } : (tensor<1x3x!tf_type.qint32>, tensor, tensor) -> tensor<1x3xf32> - return %output : tensor<1x3xf32> +func.func @main(%input: tensor<8x9xi8>, %filter: tensor<9x10xi8>) -> tensor<8x10xi32> { + %input_scale = "tf.Const"() { value = dense<0.588> : tensor } : () -> tensor + %input_zp = "tf.Const"() { value = dense<42> : tensor } : () -> tensor + %filter_scale = "tf.Const"() { value = dense<0.0235> : tensor } : () -> tensor + %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %accum_scale = "tf.Const"() { value = dense<0.013818> : tensor } : () -> tensor + %accum_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %quant_input = "tf.Cast"(%input) {} : (tensor<8x9xi8>) -> tensor<8x9x!tf_type.qint8> + %quant_filter = "tf.Cast"(%filter) {} : (tensor<9x10xi8>) -> tensor<9x10x!tf_type.qint8> + %0 = "tf.UniformQuantizedDot"( + %quant_input, %quant_filter, %input_scale, %input_zp, %filter_scale, + %filter_zp, %accum_scale, %accum_zp + ) { + Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_QINT32", attr_map = "", + device = "", lhs_quantization_axis = -1 : i64, + lhs_quantization_max_val = 127 : i64, + lhs_quantization_min_val = -128 : i64, + output_quantization_axis = -1 : i64, + output_quantization_max_val = 2147483647 : i64, + output_quantization_min_val = -2147483648 : i64, + rhs_quantization_axis = -1 : i64, + rhs_quantization_max_val = 127 : i64, + rhs_quantization_min_val = -128 : i64 + } : ( + tensor<8x9x!tf_type.qint8>, tensor<9x10x!tf_type.qint8>, tensor, + tensor, tensor, tensor, tensor, tensor + ) -> tensor<8x10x!tf_type.qint32> + %output = "tf.Cast"(%0) {} : (tensor<8x10x!tf_type.qint32>) -> tensor<8x10xi32> + return %output : tensor<8x10xi32> })mlir"; - auto input = xla::LiteralUtil::CreateR2({{50.f, -100.f}}); - auto filter = - xla::LiteralUtil::CreateR2({{1.f, 2.f, 3.f}, {-1.f, -3.f, 1.f}}); + TF_ASSERT_OK_AND_ASSIGN(auto input, CreateRandomI8Literal({8, 9})); + TF_ASSERT_OK_AND_ASSIGN(auto filter, CreateRandomI8Literal({9, 10})); ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}); } TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeDotHybrid) { + constexpr absl::string_view kTfProgram = R"mlir( +func.func @main(%input: tensor<8x9xf32>, %filter: tensor<9x10xi8>) -> tensor<8x10xf32> { + %filter_scale = "tf.Const"() { value = dense<0.0235> : tensor } : () -> tensor + %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %quant_filter = "tf.Cast"(%filter) {} : (tensor<9x10xi8>) -> tensor<9x10x!tf_type.qint8> + %filter_new = "tf.UniformDequantize"(%quant_filter, %filter_scale, %filter_zp) { + quantization_axis = -1 : i64, quantization_min_val = -128 : i64, + quantization_max_val = 127 : i64 + } : (tensor<9x10x!tf_type.qint8>, tensor, tensor) -> tensor<9x10xf32> + %0 = "tf.MatMul"(%input, %filter_new) { + } : (tensor<8x9xf32>, tensor<9x10xf32>) -> tensor<8x10xf32> + return %0 : tensor<8x10xf32> +})mlir"; constexpr absl::string_view kProgram = R"mlir( -func.func @main(%input: tensor<1x2xf32>, %filter: tensor<2x3xf32>) -> tensor<1x3xf32> { - %filter_scale = "tf.Const"() { value = dense<0.0235> : tensor } : () - -> tensor - %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor - %quant_filter = "tf.UniformQuantize"(%filter, %filter_scale, %filter_zp) { - Tin = "tfdtype$DT_FLOAT", Tout = "tfdtype$DT_QINT8", attr_map = "", - quantization_axis = -1 : i64, quantization_max_val = 127 : i64, - 
quantization_min_val = -128 : i64 - } : (tensor<2x3xf32>, tensor, tensor) -> tensor<2x3x!tf_type.qint8> - %0 = "tf.UniformQuantizedDotHybrid"( - %input, %quant_filter, %filter_scale, %filter_zp - ) { - Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_FLOAT", attr_map = "", - device = "", rhs_quantization_axis = -1 : i64, - rhs_quantization_max_val = 127 : i64, rhs_quantization_min_val = -128 : i64 - } : (tensor<1x2xf32>, tensor<2x3x!tf_type.qint8>, tensor, tensor) -> tensor<1x3xf32> - return %0 : tensor<1x3xf32> +func.func @main(%input: tensor<8x9xf32>, %filter: tensor<9x10xi8>) -> tensor<8x10xf32> { + %filter_scale = "tf.Const"() { value = dense<0.0235> : tensor } : () + -> tensor + %filter_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %quant_filter = "tf.Cast"(%filter) {} : (tensor<9x10xi8>) -> tensor<9x10x!tf_type.qint8> + %0 = "tf.UniformQuantizedDotHybrid"( + %input, %quant_filter, %filter_scale, %filter_zp + ) { + Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_FLOAT", attr_map = "", + device = "", rhs_quantization_axis = -1 : i64, + rhs_quantization_max_val = 127 : i64, rhs_quantization_min_val = -128 : i64 + } : (tensor<8x9xf32>, tensor<9x10x!tf_type.qint8>, tensor, tensor) -> tensor<8x10xf32> + return %0 : tensor<8x10xf32> })mlir"; - auto input = xla::LiteralUtil::CreateR2({{50.f, -100.f}}); - auto filter = - xla::LiteralUtil::CreateR2({{1.f, 2.f, 3.f}, {-1.f, -3.f, 1.f}}); - ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}); + TF_ASSERT_OK_AND_ASSIGN(auto input, CreateRandomF32Literal({8, 9})); + TF_ASSERT_OK_AND_ASSIGN(auto filter, CreateRandomI8Literal({9, 10})); + // TF kernels for UniformQuantizedDotHybrid does DRQ. But StableHLO hybrid ops + // does weight-only. So we use a different TF graph for evaluating expected + // weight-only quantized results. 
+ ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}, kTfProgram); +} + +TEST_F(ConvertTfQuantToMhloIntTest, UniformRequantize) { + constexpr absl::string_view kProgram = R"mlir( +func.func @main(%input: tensor<10xi8>) -> tensor<10xi8> { + %input_scale = "tf.Const"() { value = dense<0.2235> : tensor } : () -> tensor + %input_zp = "tf.Const"() { value = dense<-2> : tensor } : () -> tensor + %output_scale = "tf.Const"() { value = dense<0.11> : tensor } : () -> tensor + %output_zp = "tf.Const"() { value = dense<3> : tensor } : () -> tensor + %0 = "tf.Cast"(%input) {} : (tensor<10xi8>) -> tensor<10x!tf_type.qint8> + %1 = "tf.UniformRequantize"( + %0, %input_scale, %input_zp, %output_scale, %output_zp + ) { + Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_QINT8", attr_map = "", + device = "", input_quantization_axis = -1, + input_quantization_max_val = 127 : i64, + input_quantization_min_val = -128 : i64, + output_quantization_axis = -1 : i64, + output_quantization_max_val = 127 : i64, + output_quantization_min_val = -128 : i64 + } : ( + tensor<10x!tf_type.qint8>, tensor, tensor, tensor, + tensor + ) -> tensor<10x!tf_type.qint8> + %2 = "tf.Cast"(%1) {} : (tensor<10x!tf_type.qint8>) -> tensor<10xi8> + return %2 : tensor<10xi8> +})mlir"; + TF_ASSERT_OK_AND_ASSIGN(auto input, CreateRandomI8Literal({10})); + ExecuteAndCompareResultsWithTfKernel(kProgram, {&input}); +} + +TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeAdd) { + constexpr absl::string_view kProgram = R"mlir( +func.func @main(%lhs: tensor<10x10xi32>, %rhs: tensor<10x10xi32>) -> tensor<10x10xi32> { + %lhs_scale = "tf.Const"() { value = dense<0.518> : tensor } : () -> tensor + %lhs_zp = "tf.Const"() { value = dense<42> : tensor } : () -> tensor + %rhs_scale = "tf.Const"() { value = dense<0.0239> : tensor } : () -> tensor + %rhs_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %accum_scale = "tf.Const"() { value = dense<0.013> : tensor } : () -> tensor + %accum_zp = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %quant_lhs = "tf.Cast"(%lhs) {} : (tensor<10x10xi32>) -> tensor<10x10x!tf_type.qint32> + %quant_rhs = "tf.Cast"(%rhs) {} : (tensor<10x10xi32>) -> tensor<10x10x!tf_type.qint32> + %0 = "tf.UniformQuantizedAdd"( + %quant_lhs, %quant_rhs, %lhs_scale, %lhs_zp, %rhs_scale, + %rhs_zp, %accum_scale, %accum_zp + ) { + Tin = "tfdtype$DT_QINT32", Tout = "tfdtype$DT_QINT32", attr_map = "", + device = "", lhs_quantization_axis = -1 : i64, + lhs_quantization_max_val = 2147483647 : i64, + lhs_quantization_min_val = -2147483648 : i64, + output_quantization_axis = -1 : i64, + output_quantization_max_val = 2147483647 : i64, + output_quantization_min_val = -2147483648 : i64, + rhs_quantization_axis = -1 : i64, + rhs_quantization_max_val = 2147483647 : i64, + rhs_quantization_min_val = -2147483648 : i64 + } : ( + tensor<10x10x!tf_type.qint32>, tensor<10x10x!tf_type.qint32>, tensor, + tensor, tensor, tensor, tensor, tensor + ) -> tensor<10x10x!tf_type.qint32> + %1 = "tf.Cast"(%0) {} : (tensor<10x10x!tf_type.qint32>) -> tensor<10x10xi32> + return %1 : tensor<10x10xi32> +})mlir"; + TF_ASSERT_OK_AND_ASSIGN(auto lhs, CreateRandomI32Literal({10, 10})); + TF_ASSERT_OK_AND_ASSIGN(auto rhs, CreateRandomI32Literal({10, 10})); + // error_tolerance is set to be 1 because different rounding implementations + // in TF kernel and the lowering passes may cause +/-1 differences. 
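  // For instance (illustrative): a rescaled value that lands exactly halfway
  // between two integers, say 10.5, can legitimately round to 10 in one
  // implementation and to 11 in the other, so the qint32 results are compared
  // with an absolute tolerance of one integer step.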
+ ExecuteAndCompareResultsWithTfKernel(kProgram, {&lhs, &rhs}, + /*tf_program=*/std::nullopt, + /*error_tolerance=*/1.0); } } // namespace diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td index a449b4d59018d4..116037d9130df2 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -include "mlir/IR/OpBase.td" -include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Shape/IR/ShapeOps.td" +include "mlir/IR/OpBase.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "stablehlo/dialect/StablehloOps.td" @@ -47,7 +48,7 @@ def LiftConvWithBias : Pat< (NamedAttr<"feature_group_count"> $feature_group_count), (NamedAttr<"batch_group_count"> $batch_group_count), (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), - [(IsNotInLiftedFunc $res)], [], (addBenefit 5)>; + [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias)], [], (addBenefit 5)>; def LiftDotGeneralWithBias : Pat< (StableHLO_AddOp:$res @@ -60,7 +61,44 @@ def LiftDotGeneralWithBias : Pat< (NamedAttributeList (NamedAttr<"dot_dimension_numbers"> $dot_dimension_numbers), (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), - [(IsNotInLiftedFunc $res)], [], (addBenefit 5)>; + [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias)], [], (addBenefit 5)>; + +def LiftConvWithBiasDynamic : Pat< + (StableHLO_AddOp:$res + (StableHLO_ConvolutionOp $lhs, $rhs, $window_strides, $padding, + $lhs_dilation, $rhs_dilation, $window_reversal, $dimension_numbers, + $feature_group_count, $batch_group_count, $precision_config), + (StableHLO_DynamicBroadcastInDimOp + $bias, + (Shape_ShapeOfOp $conv), $_, $_, $_)), + (LiftAsTFXlaCallModule<"composite_conv_with_bias_dynamic_fn"> + (ArgumentList $lhs, $rhs, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"window_strides"> (DefaultOrNullAttr $window_strides)), + (NamedAttr<"padding"> (DefaultOrNullAttr $padding)), + (NamedAttr<"lhs_dilation"> (DefaultOrNullAttr $lhs_dilation)), + (NamedAttr<"rhs_dilation"> (DefaultOrNullAttr $rhs_dilation)), + (NamedAttr<"window_reversal"> (DefaultOrNullAttr $window_reversal)), + (NamedAttr<"dimension_numbers"> $dimension_numbers), + (NamedAttr<"feature_group_count"> $feature_group_count), + (NamedAttr<"batch_group_count"> $batch_group_count), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias)], [], (addBenefit 10)>; + +def LiftDotGeneralWithBiasDynamic : Pat< + (StableHLO_AddOp:$res + (StableHLO_DotGeneralOp $lhs, $rhs, $dot_dimension_numbers, $precision_config), + (StableHLO_DynamicBroadcastInDimOp + $bias, + (Shape_ShapeOfOp $dot_general), $_, $_, $_)), + (LiftAsTFXlaCallModule<"composite_dot_general_with_bias_dynamic_fn"> + (ArgumentList $lhs, $rhs, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"dot_dimension_numbers"> 
$dot_dimension_numbers), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias)], [], (addBenefit 10)>; //===----------------------------------------------------------------------===// // Pattern rules for lifting ops with activation as functions @@ -101,6 +139,45 @@ def LiftDotGeneralWithRelu : Pat< [(IsNotInLiftedFunc $res), (FloatValueEquals<"0"> $cst)], [], (addBenefit 10)>; +def LiftConvWithReluDynamic : Pat< + (StableHLO_MaxOp:$res + (StableHLO_ConvolutionOp $lhs, $rhs, $window_strides, $padding, + $lhs_dilation, $rhs_dilation, $window_reversal, $dimension_numbers, + $feature_group_count, $batch_group_count, $precision_config), + (StableHLO_DynamicBroadcastInDimOp + (StableHLO_ConstantOp $cst), + (Shape_ShapeOfOp $conv), $_, $_, $_)), + (LiftAsTFXlaCallModule<"composite_conv_with_relu_dynamic_fn"> + (ArgumentList $lhs, $rhs), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"window_strides"> (DefaultOrNullAttr $window_strides)), + (NamedAttr<"padding"> (DefaultOrNullAttr $padding)), + (NamedAttr<"lhs_dilation"> (DefaultOrNullAttr $lhs_dilation)), + (NamedAttr<"rhs_dilation"> (DefaultOrNullAttr $rhs_dilation)), + (NamedAttr<"window_reversal"> (DefaultOrNullAttr $window_reversal)), + (NamedAttr<"dimension_numbers"> $dimension_numbers), + (NamedAttr<"feature_group_count"> $feature_group_count), + (NamedAttr<"batch_group_count"> $batch_group_count), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), + (FloatValueEquals<"0"> $cst)], [], (addBenefit 15)>; + +def LiftDotGeneralWithReluDynamic : Pat< + (StableHLO_MaxOp:$res + (StableHLO_DotGeneralOp $lhs, $rhs, $dot_dimension_numbers, $precision_config), + (StableHLO_DynamicBroadcastInDimOp + (StableHLO_ConstantOp $cst), + (Shape_ShapeOfOp $dot_general), $_, $_, $_)), + (LiftAsTFXlaCallModule<"composite_dot_general_with_relu_dynamic_fn"> + (ArgumentList $lhs, $rhs), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"dot_dimension_numbers"> $dot_dimension_numbers), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), + (FloatValueEquals<"0"> $cst)], [], (addBenefit 15)>; + def LiftConvWithRelu6 : Pat< (StableHLO_ClampOp:$res (StableHLO_ConstantOp $cst_0), @@ -163,7 +240,7 @@ def LiftConvWithBiasAndRelu : Pat< (NamedAttr<"batch_group_count"> $batch_group_count), (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), [(IsNotInLiftedFunc $res), - (FloatValueEquals<"0"> $cst)], [], (addBenefit 10)>; + (FloatValueEquals<"0"> $cst), (IsStableHLOConstantOp $bias)], [], (addBenefit 10)>; def LiftDotGeneralWithBiasAndRelu : Pat< (StableHLO_MaxOp:$res @@ -179,7 +256,55 @@ def LiftDotGeneralWithBiasAndRelu : Pat< (NamedAttr<"dot_dimension_numbers"> $dot_dimension_numbers), (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), [(IsNotInLiftedFunc $res), - (FloatValueEquals<"0"> $cst)], [], (addBenefit 10)>; + (FloatValueEquals<"0"> $cst), (IsStableHLOConstantOp $bias)], [], (addBenefit 10)>; + +def LiftConvWithBiasAndReluDynamic : Pat< + (StableHLO_MaxOp:$res + (StableHLO_AddOp + (StableHLO_ConvolutionOp $lhs, $rhs, $window_strides, $padding, + $lhs_dilation, $rhs_dilation, $window_reversal, $dimension_numbers, + $feature_group_count, $batch_group_count, $precision_config), + (StableHLO_DynamicBroadcastInDimOp + $bias, + (Shape_ShapeOfOp $conv), $_, $_, $_)), + (StableHLO_DynamicBroadcastInDimOp + (StableHLO_ConstantOp $cst), 
+ (Shape_ShapeOfOp $add), $_, $_, $_)), + (LiftAsTFXlaCallModule<"composite_conv_with_bias_and_relu_dynamic_fn"> + (ArgumentList $lhs, $rhs, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"window_strides"> (DefaultOrNullAttr $window_strides)), + (NamedAttr<"padding"> (DefaultOrNullAttr $padding)), + (NamedAttr<"lhs_dilation"> (DefaultOrNullAttr $lhs_dilation)), + (NamedAttr<"rhs_dilation"> (DefaultOrNullAttr $rhs_dilation)), + (NamedAttr<"window_reversal"> (DefaultOrNullAttr $window_reversal)), + (NamedAttr<"dimension_numbers"> $dimension_numbers), + (NamedAttr<"feature_group_count"> $feature_group_count), + (NamedAttr<"batch_group_count"> $batch_group_count), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), + (FloatValueEquals<"0"> $cst), (IsStableHLOConstantOp $bias)], [], (addBenefit 15)>; + +def LiftDotGeneralWithBiasAndReluDynamic : Pat< + (StableHLO_MaxOp:$res + (StableHLO_AddOp + (StableHLO_DotGeneralOp $lhs, $rhs, $dot_dimension_numbers, $precision_config), + (StableHLO_DynamicBroadcastInDimOp + $bias, + (Shape_ShapeOfOp $dot_general), $_, $_, $_)), + (StableHLO_DynamicBroadcastInDimOp + (StableHLO_ConstantOp $cst), + (Shape_ShapeOfOp $add), $_, $_, $_)), + (LiftAsTFXlaCallModule<"composite_dot_general_with_bias_and_relu_dynamic_fn"> + (ArgumentList $lhs, $rhs, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"dot_dimension_numbers"> $dot_dimension_numbers), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), + (FloatValueEquals<"0"> $cst), (IsStableHLOConstantOp $bias)], [], (addBenefit 15)>; + def LiftConvWithBiasAndRelu6 : Pat< (StableHLO_ClampOp:$res @@ -203,7 +328,7 @@ def LiftConvWithBiasAndRelu6 : Pat< (NamedAttr<"feature_group_count"> $feature_group_count), (NamedAttr<"batch_group_count"> $batch_group_count), (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), - [(IsNotInLiftedFunc $res), (FloatValueEquals<"0"> $cst_0), (FloatValueEquals<"6"> $cst_1)], [], (addBenefit 10)>; + [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias), (FloatValueEquals<"0"> $cst_0), (FloatValueEquals<"6"> $cst_1)], [], (addBenefit 10)>; def LiftDotGeneralWithBiasAndRelu6 : Pat< (StableHLO_ClampOp:$res @@ -219,4 +344,4 @@ def LiftDotGeneralWithBiasAndRelu6 : Pat< (NamedAttributeList (NamedAttr<"dot_dimension_numbers"> $dot_dimension_numbers), (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), - [(IsNotInLiftedFunc $res), (FloatValueEquals<"0"> $cst_0), (FloatValueEquals<"6"> $cst_1)], [], (addBenefit 10)>; + [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias), (FloatValueEquals<"0"> $cst_0), (FloatValueEquals<"6"> $cst_1)], [], (addBenefit 10)>; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h index 8e9cfb57f32e63..0b05069b265989 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h @@ -21,10 +21,16 @@ limitations under the License. 
#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project // IWYU pragma: keep #include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" namespace mlir::quant::stablehlo { +// Creates a `QuantizePass` that quantizes ops according to surrounding qcast / +// dcast ops. +std::unique_ptr> CreateQuantizePass( + const quant::QuantizationSpecs& quantization_specs); + // Creates a pass that quantizes weight component of StableHLO graph. std::unique_ptr> CreateQuantizeWeightPass( const ::stablehlo::quantization::QuantizationComponentSpec& diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td index 0992c042175f01..52dca7897ea05d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td @@ -76,7 +76,30 @@ def QuantizePass : Pass<"stablehlo-quantize", "mlir::func::FuncOp"> { ]; } - def RestoreFunctionNamePass : Pass<"stablehlo-restore-function-name", "ModuleOp"> { let summary = "Restores function name from XlaCallModule op."; } + +def PostQuantizePass : Pass<"stablehlo-post-quantize", "mlir::func::FuncOp"> { + let summary = "Apply clean-up after quantization."; + let dependentDialects = [ + "mlir::stablehlo::StablehloDialect", + "mlir::quantfork::QuantizationForkDialect", + ]; +} + +def QuantizeCompositeFunctionsPass : Pass<"stablehlo-quantize-composite-functions", "ModuleOp"> { + let summary = "Quantize composite functions with QDQ input / outputs."; + let options = [ + Option<"mlir_dump_file_name_", "mlir-dump-file-name", + "std::optional", /*default=*/"std::nullopt", + "MLIR dump file name."> + ]; + let dependentDialects = [ + "mlir::arith::ArithDialect", + "mlir::stablehlo::StablehloDialect", + "mlir::quant::QuantizationDialect", + "mlir::quantfork::QuantizationForkDialect", + "TF::TensorFlowDialect", + ]; +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/post_quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/post_quantize.cc new file mode 100644 index 00000000000000..0416bbdbff73a0 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/post_quantize.cc @@ -0,0 +1,158 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" + +namespace mlir::quant::stablehlo { + +#define GEN_PASS_DEF_POSTQUANTIZEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + +namespace { + +// Applies clean-up patterns after quantization. +class PostQuantizePass : public impl::PostQuantizePassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PostQuantizePass) + + explicit PostQuantizePass() = default; + + private: + void runOnOperation() override; +}; + +// TODO: b/305815328 - Consider preserving leading and trailing QDQs for +// ModifyIONodesPass in TFLite use cases. +// Removes the back-to-back quantize and dequantize ops with volatile attribute. +class RemoveVolatileQdqPattern + : public OpRewritePattern { + public: + explicit RemoveVolatileQdqPattern(MLIRContext* context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(quantfork::DequantizeCastOp op, + PatternRewriter& rewriter) const override { + auto input_op = op.getArg().getDefiningOp(); + if (auto q = llvm::dyn_cast_or_null(input_op)) { + if (!q->getAttr(kVolatileOpAttrName)) return failure(); + + // If the quantize op is a requantize op, it is being used in other scale + // adjustments and should be kept. Instead, move dequantize op before the + // requantize op to remove the unnecessary requantize op. + if (auto qtype = + QuantizedType::getQuantizedElementType(q.getArg().getType())) { + rewriter.setInsertionPoint(op); + rewriter.replaceOpWithNewOp( + op, op.getResult().getType(), q.getArg()); + return success(); + } + + op.replaceAllUsesWith(q.getArg()); + return success(); + } + return failure(); + } +}; + +// Replaces constant and uniform_quantize ops with single quantized constant op. +class QuantizeConstPattern + : public OpRewritePattern { + public: + explicit QuantizeConstPattern(MLIRContext* context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(mlir::stablehlo::UniformQuantizeOp op, + PatternRewriter& rewriter) const override { + DenseFPElementsAttr attr; + if (matchPattern(op.getOperand(), m_Constant(&attr))) { + auto qtype = op.getResult().getType(); + ElementsAttr quantized_attr = Quantize(attr, qtype); + if (quantized_attr) { + rewriter.replaceOpWithNewOp( + op, qtype, quantized_attr); + return success(); + } + } + return failure(); + } +}; + +// Replaces quantfork.dcast with stablehlo.uniform_dequantize. 
+class ConvertDequantizeCastToUniformDequantizePattern + : public OpRewritePattern { + public: + explicit ConvertDequantizeCastToUniformDequantizePattern(MLIRContext* context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(quantfork::DequantizeCastOp dq_op, + PatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp( + dq_op, dq_op.getResult().getType(), dq_op.getArg()); + return success(); + } +}; + +// Replaces quantfork.qcast with stablehlo.uniform_quantize. +class ConvertQuantizeCastToUniformQuantizePattern + : public OpRewritePattern { + public: + explicit ConvertQuantizeCastToUniformQuantizePattern(MLIRContext* context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(quantfork::QuantizeCastOp q_op, + PatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp( + q_op, q_op.getResult().getType(), q_op.getArg()); + return success(); + } +}; + +void PostQuantizePass::runOnOperation() { + RewritePatternSet patterns(&getContext()); + func::FuncOp func = getOperation(); + MLIRContext* ctx = func.getContext(); + // TODO: b/307463853 - Consider splitting passes for each pattern set. + patterns.add, + RemoveVolatileQdqPattern>(ctx); + if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns)))) { + signalPassFailure(); + } + + RewritePatternSet patterns_2(&getContext()); + patterns_2 + .add( + ctx); + if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns_2)))) { + signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc index c5b7a3a3a3420d..16e7ad1cfd7010 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include @@ -137,4 +138,9 @@ void QuantizePass::runOnOperation() { } // namespace +std::unique_ptr> CreateQuantizePass( + const QuantizationSpecs& quantization_specs) { + return std::make_unique(quantization_specs); +} + } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc new file mode 100644 index 00000000000000..cf0c44f779a9ae --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc @@ -0,0 +1,358 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/status/status.h" +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::quant::stablehlo { + +#define GEN_PASS_DEF_QUANTIZECOMPOSITEFUNCTIONSPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + +namespace { + +using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod; +using ::mlir::stablehlo::DotGeneralOp; +using ::mlir::stablehlo::UniformQuantizeOp; +using ::tensorflow::quantization::RunPassesOnModuleOp; + +constexpr StringRef kCompositeFuncPrefix = "composite_"; +constexpr StringRef kQuantizedFuncPrefix = "quantized_"; +constexpr StringRef kEntryFuncAttrName = "_entry_function"; + +class QuantizeCompositeFunctionsPass + : public impl::QuantizeCompositeFunctionsPassBase< + QuantizeCompositeFunctionsPass> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizeCompositeFunctionsPass) + + using impl::QuantizeCompositeFunctionsPassBase< + QuantizeCompositeFunctionsPass>::QuantizeCompositeFunctionsPassBase; + + private: + void runOnOperation() override; +}; + +// Returns true if `type` is a TensorType with quantized elements. +bool IsQuantizedTensorType(const Type type) { + return type.isa() && + type.cast().getElementType().isa(); +} + +// Checks if all inputs and outputs are quantized. 
+bool HasQuantizedOperandOrOutput(Operation* call_op) { + SmallVector arg_types; + for (const Value arg : call_op->getOperands()) { + arg_types.push_back(arg.getType()); + } + + SmallVector output_types; + for (const Value output : call_op->getResults()) { + output_types.push_back(output.getType()); + } + + return absl::c_all_of(arg_types, IsQuantizedTensorType) && + absl::c_all_of(output_types, IsQuantizedTensorType); +} + +// Get the corresponding quantized function name from the given function name. +// Example: "composite_dot_general_fn_1" => "quantized_dot_general_fn" +std::string GetQuantizedFunctionName(const StringRef func_name) { + return Twine(kQuantizedFuncPrefix) + .concat(func_name.rsplit(kCompositeFuncPrefix).second) + .str(); +} + +// Returns true if `xla_call_module_op` is quantized. To be considered +// quantized, it should meet three conditions: +// 1. At least one of the inputs or outputs should be a uniform quantized type. +// 2. `xla_call_module_op` should have the `kQuantTraitAttrName` attribute. +// 3. It should also have the `kEntryFuncAttrName` attribute, which points to +// the function that `xla_call_module_op` represents. +bool IsQuantizedXlaCallModuleOp(TF::XlaCallModuleOp xla_call_module_op) { + return HasQuantizedOperandOrOutput(xla_call_module_op) && + xla_call_module_op->hasAttr(kQuantTraitAttrName) && + xla_call_module_op->hasAttr(kEntryFuncAttrName); +} + +// Returns the entry function, i.e. the callee of `xla_call_module_op`. +func::FuncOp GetEntryFuncOp(TF::XlaCallModuleOp xla_call_module_op, + SymbolTable symbol_table) { + auto entry_function_symbol_ref = + xla_call_module_op->getAttrOfType(kEntryFuncAttrName); + + // Don't match if there are no DotGeneralOp. + // if (target_func_op.getOps().empty()) return {}; + return dyn_cast_or_null( + symbol_table.lookup(entry_function_symbol_ref.getValue())); +} + +// Replaces the function type of `entry_func_op` to a quantized one, matching +// the input and output types of `xla_call_module_op`. +void SetQuantizedFunctionType(PatternRewriter& rewriter, + func::FuncOp entry_func_op, + TF::XlaCallModuleOp xla_call_module_op) { + SmallVector arg_types; + SmallVector arg_locs; + for (const Value arg : xla_call_module_op.getArgs()) { + arg_types.push_back(arg.getType()); + arg_locs.push_back(arg.getLoc()); + } + + SmallVector output_types; + for (const Value output : xla_call_module_op.getOutput()) { + output_types.push_back(output.getType()); + } + + entry_func_op.setFunctionType( + rewriter.getFunctionType(arg_types, output_types)); + + // Replace argument types and locs. + Block& entry = entry_func_op->getRegion(0).front(); + for (auto [arg, arg_type, arg_loc] : + llvm::zip_equal(entry.getArguments(), arg_types, arg_locs)) { + arg.setType(arg_type); + arg.setLoc(arg_loc); + } +} + +// An interface representing patterns that quantizes an entry function's body. +// The entry function's signatures should have already been quantized at the +// point of rewriting. +class EntryFuncBodyQuantizationPattern { + public: + virtual ~EntryFuncBodyQuantizationPattern() = default; + + // Returns `success()` if `entry_func_op`'s body is eligible for rewriting. At + // this point `entry_func_op`'s signature has not been reset with quantized + // types. + virtual LogicalResult match(func::FuncOp entry_func_op) const = 0; + + // Rewrites the `entry_func_op`'s body. + virtual void rewrite(func::FuncOp entry_func_op, + PatternRewriter& rewriter) const = 0; +}; + +// Quantizes the entry function's body containing a `DotGeneralOp`. 
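+// As a rough sketch (types and attributes abbreviated), a matched body such as +//   %0 = stablehlo.dot_general %arg0, %arg1 ... -> tensor<...xf32> +//   return %0 +// is rewritten so that the dot_general produces an intermediate i32 quantized +// tensor (scale = lhs_scale * rhs_scale, zero point 0), which is then +// requantized to the function's i8 quantized result type by an inserted +// stablehlo.uniform_quantize op before the return.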
+class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern { + public: + explicit QuantizeDotGeneralOpPattern(MLIRContext& ctx) : ctx_(&ctx) {} + + LogicalResult match(func::FuncOp entry_func_op) const override { + auto& operations = entry_func_op.getBody().front().getOperations(); + return success(operations.size() == 2 && + isa(operations.front())); + } + + void rewrite(func::FuncOp entry_func_op, + PatternRewriter& rewriter) const override { + // Update the output type of the dot_general op. + auto dot_general_op = *entry_func_op.getOps().begin(); + + const Type input_type = entry_func_op.getArgumentTypes()[0]; + const Type rhs_type = entry_func_op.getArgumentTypes()[1]; + const Type func_result_type = entry_func_op.getResultTypes()[0]; + + const double input_scale = getElementTypeOrSelf(input_type) + .cast() + .getScale(); + const double rhs_scale = + getElementTypeOrSelf(rhs_type).cast().getScale(); + + // Define the intermediate output type, which is an i32 quantized type. + // This is intermediate because the final output type of the entry_func_op + // should be an i8 quantized type. + const UniformQuantizedType output_quantized_element_type = + CreateI32F32UniformQuantizedType(dot_general_op->getLoc(), *ctx_, + input_scale * rhs_scale, + /*zero_point=*/0); + + Value dot_general_op_result = dot_general_op->getResult(0); + const auto dot_general_op_result_type = + dot_general_op_result.getType().cast(); + const ArrayRef shape = dot_general_op_result_type.getShape(); + + const TensorType new_dot_general_op_result_type = + dot_general_op_result_type.cloneWith(shape, + output_quantized_element_type); + dot_general_op_result.setType(new_dot_general_op_result_type); + + // Add i32 -> i8 requantization. + rewriter.setInsertionPointAfter(dot_general_op); + auto uniform_quant_op = rewriter.create( + dot_general_op->getLoc(), func_result_type, + dot_general_op->getResults()); + + auto return_op = + cast(entry_func_op.getBody().front().getTerminator()); + return_op.setOperand(0, uniform_quant_op); + } + + private: + MLIRContext* ctx_ = nullptr; +}; + +// Converts `entry_func_op` into its quantized form according to the respective +// inputs and outputs of `xla_call_module_op`, which may be quantized. Its +// signature (type) is reset to match that of `xla_call_module_op`. +// `body_rewrite_pattern` rewrites the function's body based on the new +// signature. +void QuantizeEntryFuncOp( + MLIRContext& ctx, PatternRewriter& rewriter, + TF::XlaCallModuleOp xla_call_module_op, func::FuncOp entry_func_op, + const EntryFuncBodyQuantizationPattern& body_rewrite_pattern) { + SetQuantizedFunctionType(rewriter, entry_func_op, xla_call_module_op); + + body_rewrite_pattern.rewrite(entry_func_op, rewriter); + + // Rename the function to make clear that it has been quantized. + const std::string quantized_function_name = + GetQuantizedFunctionName(entry_func_op.getSymName()); + entry_func_op.setSymName(quantized_function_name); +} + +// Replaces a quantized `xla_call_module_op` with a `func::CallOp`. The callee +// is expected to still be unquantized at this point (hence the signature +// mismatch); it is quantized here accordingly.
+void ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( + MLIRContext& ctx, PatternRewriter& rewriter, + TF::XlaCallModuleOp xla_call_module_op, + const EntryFuncBodyQuantizationPattern& body_rewrite_pattern) { + auto module_op = xla_call_module_op->getParentOfType(); + SymbolTable symbol_table(module_op); + + func::FuncOp entry_func_op = GetEntryFuncOp(xla_call_module_op, symbol_table); + QuantizeEntryFuncOp(ctx, rewriter, xla_call_module_op, entry_func_op, + body_rewrite_pattern); + + // Replace the XlaCallModuleOp with a new CallOp. + rewriter.setInsertionPoint(xla_call_module_op); + rewriter.replaceOpWithNewOp(xla_call_module_op, entry_func_op, + xla_call_module_op.getArgs()); +} + +// Pattern that mainly does two things: +// +// 1. Replaces quantized `TF::XlaCallModuleOp` with a `func::CallOp`. +// 2. Quantizes the callee function. +// +// The input to this pattern is assumed to be a temporarily invalid IR, where a +// `TF::XlaCallModuleOp` may already be quantized while its callee remains +// unquantized. Step (2) not only replaces the callee's input and output tensor +// types with quantized ones, but also rewrites its body with a quantized +// equivalent. +// +// `FuncBodyRewritePatternT` defines how a function body is quantized and +// rewritten. +template >> +class XlaCallModuleOpToCallOp : public OpRewritePattern { + public: + explicit XlaCallModuleOpToCallOp(MLIRContext& ctx) + : OpRewritePattern(&ctx) {} + + LogicalResult match(TF::XlaCallModuleOp op) const override { + auto module_op = op->getParentOfType(); + SymbolTable symbol_table(module_op); + + // Ignore unquantized ops. + if (!IsQuantizedXlaCallModuleOp(op)) return failure(); + + func::FuncOp entry_func_op = GetEntryFuncOp(op, symbol_table); + if (!entry_func_op) { + op->emitError("Failed to find a valid entry function."); + return failure(); + } + + return FuncBodyRewritePatternT(*getContext()).match(entry_func_op); + } + + void rewrite(TF::XlaCallModuleOp xla_call_module_op, + PatternRewriter& rewriter) const override { + ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( + *rewriter.getContext(), rewriter, xla_call_module_op, + FuncBodyRewritePatternT(*getContext())); + } +}; + +void QuantizeCompositeFunctionsPass::runOnOperation() { + MLIRContext& ctx = getContext(); + + QuantizationSpecs quant_specs; + quant_specs.inference_type = tensorflow::DT_QINT8; + + PassManager pm(&ctx); + // Intermediate output from QuantizePass will have quantized ops + // (XlaCallModuleOps) with quantized input and output types, which are not + // allowed in the TF dialect. + pm.enableVerifier(false); + + pm.addNestedPass(CreatePrepareQuantizePass()); + pm.addNestedPass(CreateQuantizePass(quant_specs)); + pm.addNestedPass(createPostQuantizePass()); + + ModuleOp module_op = getOperation(); + if (const absl::Status pm_run_status = + RunPassesOnModuleOp(mlir_dump_file_name_, pm, module_op); + !pm_run_status.ok()) { + signalPassFailure(); + } + + // TODO - b/307839649: Move this as a separate pass.
+ RewritePatternSet patterns(&ctx); + patterns.add>(ctx); + + if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace + +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc index 8cc75fbc261a20..5bf8ba7ec07657 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc @@ -78,7 +78,43 @@ std::string CreateStablehloFunctionName(const int id) { return Twine("_stablehlo_main_").concat(std::to_string(id)).str(); } -// Follows the structure of Live-variable analysis. +// Follows the structure of Live-variable analysis. It is a form of +// CFG (Control Flow Graph) analysis, often used in compilers. +// +// A variable is live if it holds a value that may be used in the future. +// It is live-in at node n if it is live on any of the node's in-edges. +// It is live-out at node n if it is live on any of the node's out-edges. +// def[n] refers to values that are defined at node n. +// use[n] refers to values that are used at node n. +// +// Given a node n, variables' liveness is defined as follows: +// live_in[n] = use[n] U (live_out[n] - def[n]) +// live_out[n] = U {live_in[s] | s ∈ succ[n]} +// +// Consider a sequence of ops: +// +// ``` +// node 1: %0 = stablehlo.constant +// node 2: %1 = stablehlo.constant +// node 3: %2 = stablehlo.add %0, %1 +// node 4: %3 = stablehlo.multiply %2, %1 +// node 5: return %3 +// ``` +// +// In backward liveness analysis, the liveness for each node above becomes: +// live_in[5] = use[5] U (live_out[5] - def[5]) +// = {%3} U ({∅} - {∅}) = {%3} +// live_in[4] = use[4] U (live_out[4] - def[4]) +// = {%1, %2} U ({%3} - {%3}) = {%1, %2} +// live_in[3] = use[3] U (live_out[3] - def[3]) +// = {%0, %1} U ({%1, %2} - {%2}) = {%0, %1} +// live_in[2] = use[2] U (live_out[2] - def[2]) +// = {∅} U ({%0, %1} - {%1}) = {%0} +// live_in[1] = use[1] U (live_out[1] - def[1]) +// = {∅} U ({%0} - {%0}) = {∅} +// +// This analysis is used throughout this pass to ensure only live edges form +// proper subgraphs. class LiveOuts { public: LiveOuts() = default; @@ -100,10 +136,10 @@ class LiveOuts { void snapshot_previous_state() { prev_liveouts_ = liveouts_; } // Return the current live values. - DenseSet& get() { return liveouts_; } + const DenseSet& get() const { return liveouts_; } // Return the previous live values. - DenseSet& get_previous() { return prev_liveouts_; } + const DenseSet& get_previous() const { return prev_liveouts_; } private: DenseSet liveouts_; @@ -212,6 +248,38 @@ void ReplaceStablehloOpsWithXlaCallModuleOp( } } +// Contains the actual logic for updating states and replacing StableHLO ops +// with tf.XlaCallModuleOps.
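+// The subgraph's inputs are the collected `operands` minus the values in +// `defined_values` (values produced inside the subgraph itself); its outputs +// are the values that are live below the subgraph but no longer live above it, +// i.e. values defined in the subgraph that later ops still use.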
+void UpdateStatesAndReplaceStablehloOps( + const DenseSet& operands, const DenseSet& defined_values, + const LiveOuts& liveouts, ModuleOp module_op, + ArrayRef reverse_subgraph, const int stablehlo_func_id, + func::FuncOp main_func, const bool is_last_subgraph = false) { + DenseSet inputs = operands; + for (Value defined_value : defined_values) { + inputs.erase(defined_value); + } + + DenseSet outputs = liveouts.get_previous(); + for (Value live_value : liveouts.get()) { + outputs.erase(live_value); + } + + if (is_last_subgraph) { + // Additionally remove arguments from the outputs, as it provides liveness + // throughout (functions as an invisible op above the very first op that + // returns the arguments). + for (const BlockArgument arg : main_func.getArguments()) { + outputs.erase(arg); + } + } + + ReplaceStablehloOpsWithXlaCallModuleOp( + SmallVector(inputs.begin(), inputs.end()), + SmallVector(outputs.begin(), outputs.end()), reverse_subgraph, + stablehlo_func_id, module_op); +} + // Replaces the StableHLO ops in the main function block with // tf.XlaCallModuleOps as separate subgraphs. Wires them back to the main // function block to be compatible with SavedModel structure. @@ -241,20 +309,14 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOps( DenseSet operands; DenseSet defined_values; - int stablehlo_func_id = 0; + int stablehlo_func_id = -1; for (Operation* op : reverse_main_func_block_ops) { if (!IsStablehloOp(op)) { // Create an XlaCallModuleOp if reverse_subgraph isn't empty. if (!reverse_subgraph.empty()) { - DenseSet outputs = liveouts.get_previous(); - for (Value live_value : liveouts.get()) { - outputs.erase(live_value); - } - - ReplaceStablehloOpsWithXlaCallModuleOp( - SmallVector(operands.begin(), operands.end()), - SmallVector(outputs.begin(), outputs.end()), - reverse_subgraph, stablehlo_func_id++, module_op); + UpdateStatesAndReplaceStablehloOps(operands, defined_values, liveouts, + module_op, reverse_subgraph, + ++stablehlo_func_id, main_func); // Reset states and start a new subgraph. reverse_subgraph.clear(); @@ -273,25 +335,16 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOps( } reverse_subgraph.push_back(op); + + defined_values.insert(op->getResults().begin(), op->getResults().end()); + operands.insert(op->getOperands().begin(), op->getOperands().end()); } // Create the last subgraph if it isn't empty. if (!reverse_subgraph.empty()) { - DenseSet outputs = liveouts.get_previous(); - for (Value live_value : liveouts.get()) { - outputs.erase(live_value); - } - // Additionally remove arguments from the outputs, as it provides liveness - // throughout (functions as an invisible op above the very first op that - // returns the arguments). 
- for (const BlockArgument arg : main_func.getArguments()) { - outputs.erase(arg); - } - - ReplaceStablehloOpsWithXlaCallModuleOp( - SmallVector(operands.begin(), operands.end()), - SmallVector(outputs.begin(), outputs.end()), reverse_subgraph, - stablehlo_func_id++, module_op); + UpdateStatesAndReplaceStablehloOps( + operands, defined_values, liveouts, module_op, reverse_subgraph, + ++stablehlo_func_id, main_func, /*is_last_subgraph=*/true); } } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/utils.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/utils.td index 8c6ab88b3368e9..744637d58d8760 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/utils.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/utils.td @@ -24,3 +24,7 @@ class FloatValueEquals : Constraint; + +// Returns true if the given op is a StableHLO constant op. +def IsStableHLOConstantOp : Constraint($0.getDefiningOp())">>; + diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir index 9f9c11454c5c2a..cb8ad65a5cdd48 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir @@ -106,6 +106,53 @@ func.func @uniform_quantize_and_dequantize_sparse_tensor_encoding(%arg0: tensor< // ----- +// CHECK-LABEL: func @quantize_per_channel +func.func @quantize_per_channel(%arg0: tensor<26x26x3x2xf32> + ) -> tensor<26x26x3x2x!quant.uniform> { + // CHECK-DAG: %[[SCALES:.*]] = mhlo.constant dense<[1.100000e+00, 1.100000e-01]> + // CHECK-DAG: %[[ZPS:.*]] = mhlo.constant dense<[-1.000000e+01, 2.000000e+00]> + // CHECK-DAG: %[[QMIN:.*]] = mhlo.constant dense<-2.14748365E+9> : tensor + // CHECK-DAG: %[[QMAX:.*]] = mhlo.constant dense<2.14748365E+9> : tensor + // CHECK: %[[DIVIDE:.*]] = chlo.broadcast_divide %arg0, %[[SCALES]] + // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK-SAME: (tensor<26x26x3x2xf32>, tensor<2xf32>) -> tensor<26x26x3x2xf32> + // CHECK: %[[ADD:.*]] = chlo.broadcast_add %[[DIVIDE]], %[[ZPS]] + // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK-SAME: (tensor<26x26x3x2xf32>, tensor<2xf32>) -> tensor<26x26x3x2xf32> + // CHECK: %[[CLAMP:.*]] = mhlo.clamp %[[QMIN]], %[[ADD]], %[[QMAX]] + // CHECK: %[[ROUND:.*]] = mhlo.round_nearest_even %[[CLAMP]] + // CHECK: %[[RESULT:.*]] = mhlo.convert %[[ROUND]] + // CHECK-SAME: (tensor<26x26x3x2xf32>) -> tensor<26x26x3x2xi32> + %0 = mhlo.uniform_quantize %arg0 : (tensor<26x26x3x2xf32> + ) -> tensor<26x26x3x2x!quant.uniform> + return %0 : tensor<26x26x3x2x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: func @dequantize_per_channel +func.func @dequantize_per_channel( + %arg0: tensor<26x26x3x2x!quant.uniform> + ) -> tensor<26x26x3x2xf32> { + // CHECK-DAG: %[[SCALES:.*]] = mhlo.constant dense<[1.100000e+00, 1.100000e-01]> + // CHECK-DAG: %[[ZPS:.*]] = mhlo.constant dense<[-10, 2]> : tensor<2xi32> + // CHECK: %[[SUBTRACT:.*]] = chlo.broadcast_subtract + // CHECK-SAME: %[[INPUT:.*]], %[[ZPS]] + // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK-SAME: (tensor<26x26x3x2xi32>, tensor<2xi32>) -> tensor<26x26x3x2xi32> + // CHECK: %[[FLOAT:.*]] = mhlo.convert %[[SUBTRACT]] + // CHECK: %[[RESULT:.*]] = chlo.broadcast_multiply + // CHECK-SAME: %[[FLOAT]], %[[SCALES]] + // CHECK-SAME: 
{broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK-SAME: (tensor<26x26x3x2xf32>, tensor<2xf32>) -> tensor<26x26x3x2xf32> + %0 = mhlo.uniform_dequantize %arg0 : ( + tensor<26x26x3x2x!quant.uniform> + ) -> tensor<26x26x3x2xf32> + return %0 : tensor<26x26x3x2xf32> +} + +// ----- + // CHECK-LABEL: func @add func.func @add( %arg0: tensor>, @@ -173,17 +220,11 @@ func.func @add_different_lhs_type( %arg0: tensor>, %arg1: tensor> ) -> tensor> { - // CHECK: %[[VAL1:.*]] = mhlo.convert %[[LHS:.*]] : (tensor) -> tensor - // CHECK-DAG: %[[INPUT_ZPS:.*]] = mhlo.constant dense<3> : tensor - // CHECK: %[[VAL2:.*]] = chlo.broadcast_subtract %[[VAL1]], %[[INPUT_ZPS]] : (tensor, tensor) -> tensor - // CHECK-DAG: %[[MULTIPLIER:.*]] = mhlo.constant dense<16384> : tensor - // CHECK-DAG: %[[TOTAL_SHIFT:.*]] = mhlo.constant dense<13> : tensor - // CHECK-DAG: %[[HALF:.*]] = mhlo.constant dense<4096> : tensor - // CHECK: %[[VAL3:.*]] = chlo.broadcast_multiply %[[VAL2]], %[[MULTIPLIER]] : (tensor, tensor) -> tensor - // CHECK: %[[VAL4:.*]] = chlo.broadcast_add %[[VAL3]], %[[HALF]] : (tensor, tensor) -> tensor - // CHECK: %[[VAL5:.*]] = chlo.broadcast_shift_right_arithmetic %[[VAL4]], %[[TOTAL_SHIFT]] : (tensor, tensor) -> tensor - // CHECK-DAG: %[[OUTPUT_ZPS:.*]] = mhlo.constant dense<1> : tensor - // CHECK: %[[LHS_32_REQ:.*]] = chlo.broadcast_add %[[VAL5]], %[[OUTPUT_ZPS]] : (tensor, tensor) -> tensor + // CHECK-DAG: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<2.000000e+00> : tensor + // CHECK-DAG: %[[LHS:.*]] = mhlo.convert %arg0 : (tensor) -> tensor + // CHECK-DAG: %[[MUL:.*]] = chlo.broadcast_multiply %[[LHS]], %[[COMBINED_SCALE]] : (tensor, tensor) -> tensor + // CHECK-DAG: %[[COMBINED_ZP:.*]] = mhlo.constant dense<-5.000000e+00> + // CHECK: %[[LHS_32:.*]] = chlo.broadcast_add %[[MUL]], %[[COMBINED_ZP]] : (tensor, tensor) -> tensor // CHECK-DAG: %[[RHS_32:.*]] = mhlo.convert %[[RHS:.*]] : (tensor) -> tensor // CHECK-DAG: %[[RES_ZPS:.*]] = mhlo.constant dense<1> : tensor @@ -207,18 +248,11 @@ func.func @add_different_rhs_type( %arg0: tensor>, %arg1: tensor> ) -> tensor> { - // CHECK: %[[VAL0:.*]] = mhlo.convert %[[LHS:.*]] : (tensor) -> tensor - // CHECK: %[[VAL1:.*]] = mhlo.convert %[[RHS:.*]] : (tensor) -> tensor - // CHECK-DAG: %[[INPUT_ZPS:.*]] = mhlo.constant dense<3> : tensor - // CHECK: %[[VAL2:.*]] = chlo.broadcast_subtract %[[VAL1]], %[[INPUT_ZPS]] : (tensor, tensor) -> tensor - // CHECK-DAG: %[[MULTIPLIER:.*]] = mhlo.constant dense<16384> : tensor - // CHECK-DAG: %[[TOTAL_SHIFT:.*]] = mhlo.constant dense<13> : tensor - // CHECK-DAG: %[[HALF:.*]] = mhlo.constant dense<4096> : tensor - // CHECK: %[[VAL3:.*]] = chlo.broadcast_multiply %[[VAL2]], %[[MULTIPLIER]] : (tensor, tensor) -> tensor - // CHECK: %[[VAL4:.*]] = chlo.broadcast_add %[[VAL3]], %[[HALF]] : (tensor, tensor) -> tensor - // CHECK: %[[VAL5:.*]] = chlo.broadcast_shift_right_arithmetic %[[VAL4]], %[[TOTAL_SHIFT]] : (tensor, tensor) -> tensor - // CHECK-DAG: %[[OUTPUT_ZPS:.*]] = mhlo.constant dense<1> : tensor - // CHECK: %[[RHS_32_REQ:.*]] = chlo.broadcast_add %[[VAL5]], %[[OUTPUT_ZPS]] : (tensor, tensor) -> tensor + // CHECK-DAG: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<2.000000e+00> : tensor + // CHECK-DAG: %[[RHS:.*]] = mhlo.convert %arg1 : (tensor) -> tensor + // CHECK-DAG: %[[MUL:.*]] = chlo.broadcast_multiply %[[RHS]], %[[COMBINED_SCALE]] : (tensor, tensor) -> tensor + // CHECK-DAG: %[[COMBINED_ZP:.*]] = mhlo.constant dense<-5.000000e+00> + // CHECK: %[[RHS_32:.*]] = chlo.broadcast_add %[[MUL]], %[[COMBINED_ZP]] : (tensor, 
tensor) -> tensor // CHECK-DAG: %[[RES_ZPS:.*]] = mhlo.constant dense<1> : tensor // CHECK-DAG: %[[VAL7:.*]] = chlo.broadcast_add %[[LHS_32:.*]], %[[RHS_32_REQ:.*]] : (tensor, tensor) -> tensor @@ -239,29 +273,17 @@ func.func @add_different_res_type( %arg0: tensor>, %arg1: tensor> ) -> tensor> { - // CHECK: %[[VAL1:.*]] = mhlo.convert %[[LHS:.*]] : (tensor) -> tensor - // CHECK-DAG: %[[INPUT_ZPS:.*]] = mhlo.constant dense<3> : tensor - // CHECK: %[[VAL2:.*]] = chlo.broadcast_subtract %[[VAL1]], %[[INPUT_ZPS]] : (tensor, tensor) -> tensor - // CHECK-DAG: %[[MULTIPLIER:.*]] = mhlo.constant dense<16384> : tensor - // CHECK-DAG: %[[TOTAL_SHIFT:.*]] = mhlo.constant dense<13> : tensor - // CHECK-DAG: %[[HALF:.*]] = mhlo.constant dense<4096> : tensor - // CHECK: %[[VAL3:.*]] = chlo.broadcast_multiply %[[VAL2]], %[[MULTIPLIER]] : (tensor, tensor) -> tensor - // CHECK: %[[VAL4:.*]] = chlo.broadcast_add %[[VAL3]], %[[HALF]] : (tensor, tensor) -> tensor - // CHECK: %[[VAL5:.*]] = chlo.broadcast_shift_right_arithmetic %[[VAL4]], %[[TOTAL_SHIFT]] : (tensor, tensor) -> tensor - // CHECK-DAG: %[[OUTPUT_ZPS:.*]] = mhlo.constant dense<1> : tensor - // CHECK: %[[LHS_32_REQ:.*]] = chlo.broadcast_add %[[VAL5]], %[[OUTPUT_ZPS]] : (tensor, tensor) -> tensor - - // CHECK: %[[VAL6:.*]] = mhlo.convert %[[RHS:.*]] : (tensor) -> tensor - // CHECK-DAG: %[[INPUT_ZPS:.*]] = mhlo.constant dense<3> : tensor - // CHECK: %[[VAL7:.*]] = chlo.broadcast_subtract %[[VAL6]], %[[INPUT_ZPS]] : (tensor, tensor) -> tensor - // CHECK-DAG: %[[MULTIPLIER:.*]] = mhlo.constant dense<16384> : tensor - // CHECK-DAG: %[[TOTAL_SHIFT:.*]] = mhlo.constant dense<13> : tensor - // CHECK-DAG: %[[HALF:.*]] = mhlo.constant dense<4096> : tensor - // CHECK: %[[VAL8:.*]] = chlo.broadcast_multiply %[[VAL7]], %[[MULTIPLIER]] : (tensor, tensor) -> tensor - // CHECK: %[[VAL9:.*]] = chlo.broadcast_add %[[VAL8]], %[[HALF]] : (tensor, tensor) -> tensor - // CHECK: %[[VAL10:.*]] = chlo.broadcast_shift_right_arithmetic %[[VAL9]], %[[TOTAL_SHIFT]] : (tensor, tensor) -> tensor - // CHECK-DAG: %[[OUTPUT_ZPS:.*]] = mhlo.constant dense<1> : tensor - // CHECK: %[[RHS_32_REQ:.*]] = chlo.broadcast_add %[[VAL10]], %[[OUTPUT_ZPS]] : (tensor, tensor) -> tensor + // CHECK-DAG: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<2.000000e+00> : tensor + // CHECK-DAG: %[[LHS:.*]] = mhlo.convert %arg0 : (tensor) -> tensor + // CHECK-DAG: %[[MUL:.*]] = chlo.broadcast_multiply %[[LHS]], %[[COMBINED_SCALE]] : (tensor, tensor) -> tensor + // CHECK-DAG: %[[COMBINED_ZP:.*]] = mhlo.constant dense<-5.000000e+00> + // CHECK: %[[LHS_32_REQ:.*]] = chlo.broadcast_add %[[MUL]], %[[COMBINED_ZP]] : (tensor, tensor) -> tensor + + // CHECK-DAG: %[[COMBINED_SCALE:.*]] = mhlo.constant dense<2.000000e+00> : tensor + // CHECK-DAG: %[[RHS:.*]] = mhlo.convert %arg1 : (tensor) -> tensor + // CHECK-DAG: %[[MUL:.*]] = chlo.broadcast_multiply %[[RHS]], %[[COMBINED_SCALE]] : (tensor, tensor) -> tensor + // CHECK-DAG: %[[COMBINED_ZP:.*]] = mhlo.constant dense<-5.000000e+00> + // CHECK: %[[RHS_32_REQ:.*]] = chlo.broadcast_add %[[MUL]], %[[COMBINED_ZP]] : (tensor, tensor) -> tensor // CHECK-DAG: %[[RES_ZPS:.*]] = mhlo.constant dense<1> : tensor // CHECK-DAG: %[[VAL11:.*]] = chlo.broadcast_add %[[LHS_32_REQ:.*]], %[[RHS_32_REQ:.*]] : (tensor, tensor) -> tensor @@ -1413,6 +1435,17 @@ func.func @mhlo_constant_uniform_quantized() -> tensor<1x!quant.uniform> } +// ----- + +// CHECK-LABEL: func @mhlo_constant_uniform_quantized_per_channel +func.func @mhlo_constant_uniform_quantized_per_channel() -> () { + // CHECK: 
mhlo.constant dense<[9, 4]> : tensor<2xi8> + %0 = mhlo.constant() {value = dense<[9, 4]> : tensor<2xi8>} : () + -> tensor<2x!quant.uniform> + return +} + + // ----- // CHECK-LABEL: func @mhlo_constant_int diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-tf-quant-types.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-tf-quant-types.mlir index c83f95ce90cdcc..73555e6913b91d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-tf-quant-types.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-tf-quant-types.mlir @@ -11,17 +11,17 @@ func.func @relu_qint8(%arg0: tensor<1x!tf_type.qint8>) -> tensor<1x!tf_type.qint // CHECK-LABEL: func @if_qint8(%arg0: tensor, %arg1: tensor<1xi8>, %arg2: tensor<1xi8>) -> tensor<1xi8> func.func @if_qint8(%arg0: tensor, %arg1: tensor<1x!tf_type.qint8>, %arg2: tensor<1x!tf_type.qint8>) -> tensor<1x!tf_type.qint8> { - // CHECK-NEXT: %0 = "tf.IfRegion"(%arg0) ({ + // CHECK-NEXT: %0 = "tf.IfRegion"(%arg0) <{is_stateless = false}> ({ // CHECK-NEXT: "tf.Yield"(%arg1) : (tensor<1xi8>) -> () // CHECK-NEXT: }, { // CHECK-NEXT: "tf.Yield"(%arg2) : (tensor<1xi8>) -> () - // CHECK-NEXT: }) {is_stateless = false} : (tensor) -> tensor<1xi8> + // CHECK-NEXT: }) : (tensor) -> tensor<1xi8> // CHECK-NEXT: return %0 : tensor<1xi8> - %0 = "tf.IfRegion"(%arg0) ({ + %0 = "tf.IfRegion"(%arg0) <{is_stateless = false}> ({ "tf.Yield"(%arg1) : (tensor<1x!tf_type.qint8>) -> () }, { "tf.Yield"(%arg2) : (tensor<1x!tf_type.qint8>) -> () - }) {is_stateless = false} : (tensor) -> tensor<1x!tf_type.qint8> + }) : (tensor) -> tensor<1x!tf_type.qint8> func.return %0 : tensor<1x!tf_type.qint8> } @@ -74,7 +74,7 @@ func.func @uniform_quantize(%arg0: tensor<1xf32>) -> tensor<1x!tf_type.qint8> %zps = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[qint:.*]] = "tf.UniformQuantize" - // CHECK: %[[int:.*]] = "tf.Cast"(%[[qint]]) {Truncate = false} : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> + // CHECK: %[[int:.*]] = "tf.Cast"(%[[qint]]) <{Truncate = false}> : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> %0 = "tf.UniformQuantize"(%arg0, %scales, %zps) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 } : (tensor<1xf32>, tensor, tensor) -> tensor<1x!tf_type.qint8> @@ -92,7 +92,7 @@ func.func @uniform_quantize_no_return(%arg0: tensor<1xf32>) -> () %zps = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[qint:.*]] = "tf.UniformQuantize" - // CHECK: %[[int:.*]] = "tf.Cast"(%[[qint]]) {Truncate = false} : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> + // CHECK: %[[int:.*]] = "tf.Cast"(%[[qint]]) <{Truncate = false}> : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> %0 = "tf.UniformQuantize"(%arg0, %scales, %zps) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 } : (tensor<1xf32>, tensor, tensor) -> tensor<1x!tf_type.qint8> @@ -109,7 +109,7 @@ func.func @uniform_dequantize(%arg0: tensor<1x!tf_type.qint8>) -> tensor<1xf32> %scales = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor %zps = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - // CHECK: %[[x:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi8>) -> tensor<1x!tf_type.qint8> + // CHECK: %[[x:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1xi8>) -> tensor<1x!tf_type.qint8> // CHECK: %[[y:.*]] = "tf.UniformDequantize"(%[[x]] %0 = "tf.UniformDequantize"(%arg0, %scales, %zps) { 
quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 @@ -132,8 +132,8 @@ func.func @uniform_quantize_dequantize(%arg0: tensor<1xf32>) -> tensor<1xf32> quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 } : (tensor<1xf32>, tensor, tensor) -> tensor<1x!tf_type.qint8> - // CHECK: %[[int:.*]] = "tf.Cast"(%[[qint0]]) {Truncate = false} : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> - // CHECK: %[[qint1:.*]] = "tf.Cast"(%[[int]]) {Truncate = false} : (tensor<1xi8>) -> tensor<1x!tf_type.qint8> + // CHECK: %[[int:.*]] = "tf.Cast"(%[[qint0]]) <{Truncate = false}> : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> + // CHECK: %[[qint1:.*]] = "tf.Cast"(%[[int]]) <{Truncate = false}> : (tensor<1xi8>) -> tensor<1x!tf_type.qint8> // CHECK: %[[res:.*]] = "tf.UniformDequantize"(%[[qint1]] %1 = "tf.UniformDequantize"(%0, %scales, %zps) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 @@ -155,10 +155,10 @@ func.func @uniform_quantized_add(%arg0: tensor<2x!tf_type.qint32>, %arg1: tensor %output_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor %output_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor - // CHECK: %[[lhs:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<2xi32>) -> tensor<2x!tf_type.qint32> - // CHECK: %[[rhs:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xi32>) -> tensor<2x!tf_type.qint32> + // CHECK: %[[lhs:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<2xi32>) -> tensor<2x!tf_type.qint32> + // CHECK: %[[rhs:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor<2xi32>) -> tensor<2x!tf_type.qint32> // CHECK: %[[res_qint:.*]] = "tf.UniformQuantizedAdd"(%[[lhs]], %[[rhs]] - // CHECK: %[[res_int:.*]] = "tf.Cast"(%[[res_qint]]) {Truncate = false} : (tensor<2x!tf_type.qint32>) -> tensor<2xi32> + // CHECK: %[[res_int:.*]] = "tf.Cast"(%[[res_qint]]) <{Truncate = false}> : (tensor<2x!tf_type.qint32>) -> tensor<2xi32> // CHECK: return %[[res_int]] : tensor<2xi32> %1 = "tf.UniformQuantizedAdd"( %arg0, %arg1, @@ -190,13 +190,13 @@ func.func @while_region_qint(%arg0: tensor<2x2xf32>) -> (tensor<2x?xf32>, tensor %zps4 = "tf.Const"() { value = dense<4> : tensor } : () -> tensor // CHECK: %[[qint_0:.*]] = "tf.UniformQuantize" - // CHECK: %[[int_0:.*]] = "tf.Cast"(%[[qint_0]]) {Truncate = false} : (tensor<2x2x!tf_type.qint8>) -> tensor<2x2xi8> + // CHECK: %[[int_0:.*]] = "tf.Cast"(%[[qint_0]]) <{Truncate = false}> : (tensor<2x2x!tf_type.qint8>) -> tensor<2x2xi8> %0 = "tf.UniformQuantize"(%arg0, %scales, %zps2) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 } : (tensor<2x2xf32>, tensor, tensor) -> tensor<2x2x!tf_type.qint8> // CHECK: %[[qint_1:.*]] = "tf.UniformQuantize" - // CHECK: %[[int_1:.*]] = "tf.Cast"(%[[qint_1]]) {Truncate = false} : (tensor<2x2x!tf_type.qint8>) -> tensor<2x2xi8> + // CHECK: %[[int_1:.*]] = "tf.Cast"(%[[qint_1]]) <{Truncate = false}> : (tensor<2x2x!tf_type.qint8>) -> tensor<2x2xi8> %1 = "tf.UniformQuantize"(%arg0, %scales, %zps4) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 } : (tensor<2x2xf32>, tensor, tensor) -> tensor<2x2x!tf_type.qint8> @@ -212,11 +212,11 @@ func.func @while_region_qint(%arg0: tensor<2x2xf32>) -> (tensor<2x?xf32>, tensor "tf.Yield"(%id, %barg1) : (tensor<2x?x!tf_type.qint8>, tensor) -> () }) {is_stateless = false} : (tensor<2x2x!tf_type.qint8>, tensor<2x2x!tf_type.qint8>) -> 
(tensor<2x?x!tf_type.qint8>, tensor) - // CHECK: %[[out_qint_0:.*]] = "tf.Cast"(%[[while_result]]#0) {Truncate = false} : (tensor<2x?xi8>) -> tensor<2x?x!tf_type.qint8> + // CHECK: %[[out_qint_0:.*]] = "tf.Cast"(%[[while_result]]#0) <{Truncate = false}> : (tensor<2x?xi8>) -> tensor<2x?x!tf_type.qint8> // CHECK: %[[out_f_0:.*]] = "tf.UniformDequantize"(%[[out_qint_0]] %3 = "tf.UniformDequantize"(%2#0, %scales, %zps2) {quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64} : (tensor<2x?x!tf_type.qint8>, tensor, tensor) -> tensor<2x?xf32> - // CHECK: %[[out_qint_1:.*]] = "tf.Cast"(%[[while_result]]#1) {Truncate = false} : (tensor) -> tensor + // CHECK: %[[out_qint_1:.*]] = "tf.Cast"(%[[while_result]]#1) <{Truncate = false}> : (tensor) -> tensor // CHECK: %[[out_f_1:.*]] = "tf.UniformDequantize"(%[[out_qint_1]] %4 = "tf.UniformDequantize"(%2#1, %scales, %zps4) {quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64} : (tensor, tensor, tensor) -> tensor @@ -234,7 +234,7 @@ func.func @concat_uniform_quantize(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32 // CHECK: %[[input:.*]] = "tf.ConcatV2"(%arg0, %arg1 // CHECK: %[[output_qint:.*]] = "tf.UniformQuantize"(%[[input]] - // CHECK: %[[output:.*]] = "tf.Cast"(%[[output_qint]]) {Truncate = false} : (tensor<6x3x!tf_type.qint8>) -> tensor<6x3xi8> + // CHECK: %[[output:.*]] = "tf.Cast"(%[[output_qint]]) <{Truncate = false}> : (tensor<6x3x!tf_type.qint8>) -> tensor<6x3xi8> // CHECK: return %[[output]] : tensor<6x3xi8> %0 = "tf.ConcatV2"(%arg0, %arg1, %axis) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> %1 = "tf.UniformQuantize"(%0, %scales, %zps) { @@ -252,7 +252,7 @@ func.func @concat_uniform_dequantize(%arg0: tensor<3x3x!tf_type.qint8>, %arg1: t %zps = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[input:.*]] = "tf.ConcatV2"(%arg0, %arg1, %[[VAL:.*]]) : (tensor<3x3xi8>, tensor<3x3xi8>, tensor) -> tensor<6x3xi8> - // CHECK: %[[input_qint:.*]] = "tf.Cast"(%[[input]]) {Truncate = false} : (tensor<6x3xi8>) -> tensor<6x3x!tf_type.qint8> + // CHECK: %[[input_qint:.*]] = "tf.Cast"(%[[input]]) <{Truncate = false}> : (tensor<6x3xi8>) -> tensor<6x3x!tf_type.qint8> // CHECK: %[[output:.*]] = "tf.UniformDequantize"(%[[input_qint]] // CHECK: return %[[output]] : tensor<6x3xf32> %0 = "tf.ConcatV2"(%arg0, %arg1, %axis) : (tensor<3x3x!tf_type.qint8>, tensor<3x3x!tf_type.qint8>, tensor) -> tensor<6x3x!tf_type.qint8> @@ -266,7 +266,7 @@ func.func @concat_uniform_dequantize(%arg0: tensor<3x3x!tf_type.qint8>, %arg1: t // CHECK-LABEL: func @tf_const_qint32 func.func @tf_const_qint32() -> tensor<1x!tf_type.qint32> { - // CHECK: %[[result:.*]] = "tf.Const"() {value = dense<127> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[result:.*]] = "tf.Const"() <{value = dense<127> : tensor<1xi32>}> : () -> tensor<1xi32> %0 = "tf.Const"() { value = #tf_type : tensor<1x!tf_type.qint32> } : () -> tensor<1x!tf_type.qint32> // CHECK: return %[[result]] : tensor<1xi32> func.return %0 : tensor<1x!tf_type.qint32> @@ -276,7 +276,7 @@ func.func @tf_const_qint32() -> tensor<1x!tf_type.qint32> { // CHECK-LABEL: func @tf_const_qint8 func.func @tf_const_qint8() -> tensor<2x!tf_type.qint8> { - // CHECK: %[[result:.*]] = "tf.Const"() {value = dense<[127, 18]> : tensor<2xi8>} : () -> tensor<2xi8> + // CHECK: %[[result:.*]] = "tf.Const"() <{value = dense<[127, 18]> : tensor<2xi8>}> : () -> tensor<2xi8> %0 = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint8> } 
: () -> tensor<2x!tf_type.qint8> // CHECK: return %[[result]] : tensor<2xi8> func.return %0 : tensor<2x!tf_type.qint8> @@ -295,7 +295,7 @@ func.func @tf_const_invalid_proto() -> tensor<2x!tf_type.qint32> { // CHECK-LABEL: func @cast_op_qint32_int32 func.func @cast_op_qint32_int32(%arg0: tensor<1x!tf_type.qint32>) -> tensor<1xi32> { - // CHECK: "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi32> + // CHECK: "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi32> %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x!tf_type.qint32>) -> tensor<1xi32> func.return %0: tensor<1xi32> } @@ -304,7 +304,7 @@ func.func @cast_op_qint32_int32(%arg0: tensor<1x!tf_type.qint32>) -> tensor<1xi3 // CHECK-LABEL: func @cast_op_int32_qint32 func.func @cast_op_int32_qint32(%arg0: tensor<1xi32>) -> tensor<1x!tf_type.qint32> { - // CHECK: "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi32> + // CHECK: "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi32> %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi32>) -> tensor<1x!tf_type.qint32> func.return %0: tensor<1x!tf_type.qint32> } @@ -313,7 +313,7 @@ func.func @cast_op_int32_qint32(%arg0: tensor<1xi32>) -> tensor<1x!tf_type.qint3 // CHECK-LABEL: func @cast_op_qint8_int8 func.func @cast_op_qint8_int8(%arg0: tensor<1x!tf_type.qint8>) -> tensor<1xi8> { - // CHECK: "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi8>) -> tensor<1xi8> + // CHECK: "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1xi8>) -> tensor<1xi8> %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> func.return %0: tensor<1xi8> } @@ -322,7 +322,7 @@ func.func @cast_op_qint8_int8(%arg0: tensor<1x!tf_type.qint8>) -> tensor<1xi8> { // CHECK-LABEL: func @cast_op_int8_qint8 func.func @cast_op_int8_qint8(%arg0: tensor<1xi8>) -> tensor<1x!tf_type.qint8> { - // CHECK: "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi8>) -> tensor<1xi8> + // CHECK: "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1xi8>) -> tensor<1xi8> %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi8>) -> tensor<1x!tf_type.qint8> func.return %0: tensor<1x!tf_type.qint8> } @@ -331,7 +331,7 @@ func.func @cast_op_int8_qint8(%arg0: tensor<1xi8>) -> tensor<1x!tf_type.qint8> { // CHECK-LABEL: func @cast_op_qint32_int8 func.func @cast_op_qint32_int8(%arg0: tensor<1x!tf_type.qint32>) -> tensor<1xi8> { - // CHECK: "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi8> + // CHECK: "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi8> %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x!tf_type.qint32>) -> tensor<1xi8> func.return %0: tensor<1xi8> } @@ -340,7 +340,7 @@ func.func @cast_op_qint32_int8(%arg0: tensor<1x!tf_type.qint32>) -> tensor<1xi8> // CHECK-LABEL: func @cast_op_int8_qint32 func.func @cast_op_int8_qint32(%arg0: tensor<1xi8>) -> tensor<1x!tf_type.qint32> { - // CHECK: "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi8>) -> tensor<1xi32> + // CHECK: "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1xi8>) -> tensor<1xi32> %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi8>) -> tensor<1x!tf_type.qint32> func.return %0: tensor<1x!tf_type.qint32> } @@ -353,10 +353,10 @@ func.func @cast_uniform_dequantize(%arg0: tensor<1x!tf_type.qint32>) -> tensor<1 %scales = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor %zps = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - // CHECK: %[[x:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi8> + // CHECK: 
%[[x:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi8> %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x!tf_type.qint32>) -> tensor<1x!tf_type.qint8> - // CHECK: %[[y:.*]] = "tf.Cast"(%[[x]]) {Truncate = false} : (tensor<1xi8>) -> tensor<1x!tf_type.qint8> + // CHECK: %[[y:.*]] = "tf.Cast"(%[[x]]) <{Truncate = false}> : (tensor<1xi8>) -> tensor<1x!tf_type.qint8> // CHECK: %[[z:.*]] = "tf.UniformDequantize"(%[[y]] %1 = "tf.UniformDequantize"(%0, %scales, %zps) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 @@ -379,8 +379,8 @@ func.func @uniform_quantize_cast(%arg0: tensor<1xf32>) -> tensor<1x!tf_type.qint quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 } : (tensor<1xf32>, tensor, tensor) -> tensor<1x!tf_type.qint8> - // CHECK: %1 = "tf.Cast"(%0) {Truncate = false} : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> - // CHECK: %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1xi8>) -> tensor<1xi32> + // CHECK: %1 = "tf.Cast"(%0) <{Truncate = false}> : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> + // CHECK: %2 = "tf.Cast"(%1) <{Truncate = false}> : (tensor<1xi8>) -> tensor<1xi32> %1 = "tf.Cast"(%0) {Truncate = false} : (tensor<1x!tf_type.qint8>) -> tensor<1x!tf_type.qint32> // CHECK: return %2 : tensor<1xi32> @@ -398,15 +398,15 @@ func.func @uniform_quantize_cast_dequantize(%arg0: tensor<1xf32>) -> tensor<1xf3 %zps1 = "tf.Const"() { value = dense<2> : tensor } : () -> tensor // CHECK: %[[qint_1:.*]] = "tf.UniformQuantize" - // CHECK: %[[int_1:.*]] = "tf.Cast"(%[[qint_1]]) {Truncate = false} : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> + // CHECK: %[[int_1:.*]] = "tf.Cast"(%[[qint_1]]) <{Truncate = false}> : (tensor<1x!tf_type.qint8>) -> tensor<1xi8> %0 = "tf.UniformQuantize"(%arg0, %scales, %zps) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 } : (tensor<1xf32>, tensor, tensor) -> tensor<1x!tf_type.qint8> - // CHECK: %[[int_2:.*]] = "tf.Cast"(%[[int_1]]) {Truncate = false} : (tensor<1xi8>) -> tensor<1xi32> + // CHECK: %[[int_2:.*]] = "tf.Cast"(%[[int_1]]) <{Truncate = false}> : (tensor<1xi8>) -> tensor<1xi32> %1 = "tf.Cast"(%0) {Truncate = false} : (tensor<1x!tf_type.qint8>) -> tensor<1x!tf_type.qint32> - // CHECK: %[[qint_2:.*]] = "tf.Cast"(%[[int_2]]) {Truncate = false} : (tensor<1xi32>) -> tensor<1x!tf_type.qint32> + // CHECK: %[[qint_2:.*]] = "tf.Cast"(%[[int_2]]) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1x!tf_type.qint32> // CHECK: %[[int_3:.*]] = "tf.UniformDequantize"(%[[qint_2]] %2 = "tf.UniformDequantize"(%1, %scales1, %zps1) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 @@ -423,10 +423,10 @@ func.func @uniform_quantize_clip_min_cast(%arg0: tensor<1x2x2x1x!tf_type.qint32> %scale = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor %zp = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - // CHECK-DAG: %[[MIN_QINT:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor) -> tensor + // CHECK-DAG: %[[MIN_QINT:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor) -> tensor %q_min = "tf.Cast"(%arg1) {Truncate = false} : (tensor) -> tensor - // CHECK-DAG: %[[INPUT_QINT:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> + // CHECK-DAG: %[[INPUT_QINT:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> // CHECK: 
"tf.UniformQuantizedClipByValue"(%[[INPUT_QINT]], %[[MIN_QINT]], %[[MIN_QINT]] %output = "tf.UniformQuantizedClipByValue"(%arg0, %q_min, %q_min, %scale, %zp) {quantization_axis = -1 : i64, quantization_max_val = 2147483647 : i64, quantization_min_val = -2147483648 : i64} : @@ -441,11 +441,11 @@ func.func @uniform_quantize_clip_input_cast(%arg0: tensor<1x2x2x1xi32>, %arg1: t %scale = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor %zp = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - // CHECK-DAG: %[[INPUT_QINT:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> + // CHECK-DAG: %[[INPUT_QINT:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> %q_input = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> - // CHECK-DAG: %[[MIN_QINT:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor) -> tensor - // CHECK-DAG: %[[MAX_QINT:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor) -> tensor + // CHECK-DAG: %[[MIN_QINT:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor) -> tensor + // CHECK-DAG: %[[MAX_QINT:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor) -> tensor // CHECK: "tf.UniformQuantizedClipByValue"(%[[INPUT_QINT]], %[[MIN_QINT]], %[[MAX_QINT]] %output = "tf.UniformQuantizedClipByValue"(%q_input, %arg1, %arg1, %scale, %zp) {quantization_axis = -1 : i64, quantization_max_val = 2147483647 : i64, quantization_min_val = -2147483648 : i64} : @@ -460,15 +460,15 @@ func.func @uniform_quantize_clip_output_cast(%arg0: tensor<1x2x2x1x!tf_type.qint %scale = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor %zp = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - // CHECK-DAG: %[[INPUT_QINT:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> - // CHECK-DAG: %[[MIN_QINT:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor) -> tensor - // CHECK-DAG: %[[MAX_QINT:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor) -> tensor + // CHECK-DAG: %[[INPUT_QINT:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> + // CHECK-DAG: %[[MIN_QINT:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor) -> tensor + // CHECK-DAG: %[[MAX_QINT:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor) -> tensor // CHECK: %[[OUTPUT_QINT:.*]] = "tf.UniformQuantizedClipByValue"(%[[INPUT_QINT]], %[[MIN_QINT]], %[[MAX_QINT]] %q_output = "tf.UniformQuantizedClipByValue"(%arg0, %arg1, %arg1, %scale, %zp) {quantization_axis = -1 : i64, quantization_max_val = 2147483647 : i64, quantization_min_val = -2147483648 : i64} : (tensor<1x2x2x1x!tf_type.qint32>, tensor, tensor, tensor, tensor) -> tensor<1x2x2x1x!tf_type.qint32> - // CHECK: %[[OUTPUT:.*]] = "tf.Cast"(%[[OUTPUT_QINT]]) {Truncate = false} : (tensor<1x2x2x1x!tf_type.qint32>) -> tensor<1x2x2x1xi32> + // CHECK: %[[OUTPUT:.*]] = "tf.Cast"(%[[OUTPUT_QINT]]) <{Truncate = false}> : (tensor<1x2x2x1x!tf_type.qint32>) -> tensor<1x2x2x1xi32> %output = "tf.Cast"(%q_output) {Truncate = false} : (tensor<1x2x2x1x!tf_type.qint32>) -> tensor<1x2x2x1xi32> return %output : tensor<1x2x2x1xi32> @@ -481,19 +481,19 @@ func.func @uniform_quantize_clip_output_cast_multiple_uses(%arg0: tensor<1x2x2x1 %scale = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor %zp = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - // CHECK-DAG: %[[INPUT_QINT:.*]] = "tf.Cast"(%arg0) {Truncate = false} : 
(tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> - // CHECK-DAG: %[[MIN_QINT:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor) -> tensor - // CHECK-DAG: %[[MAX_QINT:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor) -> tensor + // CHECK-DAG: %[[INPUT_QINT:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> + // CHECK-DAG: %[[MIN_QINT:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor) -> tensor + // CHECK-DAG: %[[MAX_QINT:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor) -> tensor // CHECK: %[[OUTPUT_QINT:.*]] = "tf.UniformQuantizedClipByValue"(%[[INPUT_QINT]], %[[MIN_QINT]], %[[MAX_QINT]] %q_output = "tf.UniformQuantizedClipByValue"(%arg0, %arg1, %arg1, %scale, %zp) {quantization_axis = -1 : i64, quantization_max_val = 2147483647 : i64, quantization_min_val = -2147483648 : i64} : (tensor<1x2x2x1x!tf_type.qint32>, tensor, tensor, tensor, tensor) -> tensor<1x2x2x1x!tf_type.qint32> - // CHECK-DAG: %[[OUTPUT_1:.*]] = "tf.Cast"(%[[OUTPUT_QINT]]) {Truncate = false} : (tensor<1x2x2x1x!tf_type.qint32>) -> tensor<1x2x2x1xi32> + // CHECK-DAG: %[[OUTPUT_1:.*]] = "tf.Cast"(%[[OUTPUT_QINT]]) <{Truncate = false}> : (tensor<1x2x2x1x!tf_type.qint32>) -> tensor<1x2x2x1xi32> %output = "tf.Cast"(%q_output) {Truncate = false} : (tensor<1x2x2x1x!tf_type.qint32>) -> tensor<1x2x2x1xi32> - // CHECK-DAG: %[[OUTPUT_2:.*]] = "tf.Cast"(%[[OUTPUT_QINT]]) {Truncate = false} : (tensor<1x2x2x1x!tf_type.qint32>) -> tensor<1x2x2x1xi32> - // CHECK-DAG: %[[OUTPUT_QINT_1:.*]] = "tf.Cast"(%[[OUTPUT_1]]) {Truncate = false} : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> + // CHECK-DAG: %[[OUTPUT_2:.*]] = "tf.Cast"(%[[OUTPUT_QINT]]) <{Truncate = false}> : (tensor<1x2x2x1x!tf_type.qint32>) -> tensor<1x2x2x1xi32> + // CHECK-DAG: %[[OUTPUT_QINT_1:.*]] = "tf.Cast"(%[[OUTPUT_1]]) <{Truncate = false}> : (tensor<1x2x2x1xi32>) -> tensor<1x2x2x1x!tf_type.qint32> // CHECK: "tf.UniformDequantize"(%[[OUTPUT_QINT_1:.*]] %dq = "tf.UniformDequantize"(%q_output, %scale, %zp) { quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/lift_quantizable_spots_as_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/lift_quantizable_spots_as_functions.mlir index 3e588f5aeef1bc..f085a38f952c38 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/lift_quantizable_spots_as_functions.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/lift_quantizable_spots_as_functions.mlir @@ -1,13 +1,14 @@ // RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-lift-quantizable-spots-as-functions | FileCheck %s // CHECK-LABEL: @conv_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x3x4x4xf32>) -func.func @conv_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> { - %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - func.return %0: tensor<1x3x3x4xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad 
= [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %1: tensor<1x3x3x4xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1) +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> // CHECK: } @@ -19,13 +20,14 @@ func.func @conv_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf32>) -> te // ----- // CHECK-LABEL: @dot_general_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<167x64xf32> -func.func @dot_general_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<167x64xf32>) -> tensor<1x1x64xf32> { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> - return %0 : tensor<1x1x64xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + return %1 : tensor<1x1x64xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1) +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> // CHECK: } @@ -37,15 +39,17 @@ func.func @dot_general_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<167x64xf32>) // ----- // CHECK-LABEL: @conv_with_bias_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x3x4x4xf32>, -// CHECK-SAME: %[[ARG_2:.*]]: tensor<1x3x3x4xf32>) -func.func @conv_with_bias_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf32>, %arg2: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { - %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - %1 = stablehlo.add %0, %arg2 : tensor<1x3x3x4xf32> - func.return %1: tensor<1x3x3x4xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_bias_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %3 = stablehlo.add %2, %1 : tensor<1x3x3x4xf32> + func.return %3: tensor<1x3x3x4xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1, %arg2) +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> // CHECK: } @@ -58,15 +62,17 @@ func.func 
@conv_with_bias_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf // ----- // CHECK-LABEL: @dot_general_with_bias_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<167x64xf32> -// CHECK-SAME: %[[ARG_2:.*]]: tensor<1x1x64xf32>) -func.func @dot_general_with_bias_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<167x64xf32>, %arg2: tensor<1x1x64xf32>) -> tensor<1x1x64xf32> { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> - %1 = stablehlo.add %0, %arg2 : tensor<1x1x64xf32> - func.return %1: tensor<1x1x64xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_bias_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %3 = stablehlo.add %2, %1 : tensor<1x1x64xf32> + func.return %3: tensor<1x1x64xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1, %arg2) +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> // CHECK: } @@ -78,16 +84,71 @@ func.func @dot_general_with_bias_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<16 // ----- +// CHECK-LABEL: @conv_with_bias_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_bias_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<16xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<4xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %1, %3, dims = [3] : (tensor<16xf32>, tensor<4xindex>) -> tensor + %5 = stablehlo.add %2, %4 : tensor + func.return %5: tensor +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_bias_dynamic_fn_1 +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: %[[SHAPE_OF:.*]] = shape.shape_of %[[CONV]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF]] +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM]] +// CHECK: return %[[ADD]] : tensor +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @dot_general_with_bias_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<12544x10xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<10xf32> + %2 = 
stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<12544x10xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<2xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %1, %3, dims = [1] : (tensor<10xf32>, tensor<2xindex>) -> tensor + %5 = stablehlo.add %2, %4 : tensor + func.return %5: tensor +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_dynamic_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[DOT_GENERAL]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK: return %[[ADD]] : tensor +// CHECK: } + +// ----- + // CHECK-LABEL: @conv_with_relu_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x3x4x4xf32>) -func.func @conv_with_relu_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> { - %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> - %1 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - %2 = stablehlo.maximum %1, %0 : tensor<1x3x3x4xf32> - func.return %2: tensor<1x3x3x4xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_relu_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %3 = stablehlo.maximum %2, %1 : tensor<1x3x3x4xf32> + func.return %3: tensor<1x3x3x4xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1) +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> // CHECK: } @@ -102,14 +163,15 @@ func.func @conv_with_relu_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf // CHECK-LABEL: @dot_general_with_relu_fn( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<167x64xf32> func.func @dot_general_with_relu_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<167x64xf32>) -> tensor<1x1x64xf32> { - %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> - %1 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> - %2 = stablehlo.maximum %1, %0 : tensor<1x1x64xf32> - return %2 : tensor<1x1x64xf32> + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.dot_general %arg0, %0, 
contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %3 = stablehlo.maximum %2, %1 : tensor<1x1x64xf32> + return %3 : tensor<1x1x64xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1) +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> // CHECK: } @@ -122,36 +184,95 @@ func.func @dot_general_with_relu_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<16 // ----- +// CHECK-LABEL: @conv_with_relu_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_relu_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<4xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %1, %3, dims = [] : (tensor, tensor<4xindex>) -> tensor + %5 = stablehlo.maximum %2, %4 : tensor + func.return %5: tensor +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_relu_dynamic_fn_1 +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: %[[SHAPE_OF:.*]] = shape.shape_of %[[CONV]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM]] +// CHECK: return %[[MAX]] : tensor +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_relu_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @dot_general_with_relu_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<12544x10xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor + %2 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<12544x10xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<2xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %1, %3, dims = [] : (tensor, tensor<2xindex>) -> tensor + %5 = stablehlo.maximum %2, %4 : tensor + func.return %5: tensor +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_relu_dynamic_fn_1 +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[SHAPE_OF:.*]] = shape.shape_of %[[DOT_GENERAL]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM]] +// CHECK: return %[[MAX]] : tensor +// CHECK: } + +// ----- + // The pattern should not match when the 
const value for relu is not 0. // CHECK-LABEL: @conv_with_relu_wrong_const_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x3x4x4xf32>) +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> func.func @conv_with_relu_wrong_const_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> { - %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x3x4xf32> - %1 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - %2 = stablehlo.maximum %1, %0 : tensor<1x3x3x4xf32> - func.return %2: tensor<1x3x3x4xf32> + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %3 = stablehlo.maximum %2, %1 : tensor<1x3x3x4xf32> + func.return %3: tensor<1x3x3x4xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1) -// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]]) +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[XLA_CALL_MODULE]], %[[CONST_1]] +// CHECK: return %[[MAX]] : tensor<1x3x3x4xf32> // CHECK: } // CHECK-LABEL: private @composite_conv_fn_1 +// CHECK-NOT: private @composite_conv_with_relu_fn_1 // ----- // CHECK-LABEL: @conv_with_relu6_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x3x4x4xf32>) -func.func @conv_with_relu6_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> { - %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> - %1 = stablehlo.constant dense<6.000000e+00> : tensor<1x3x3x4xf32> - %2 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - %3 = stablehlo.clamp %0, %2, %1 : tensor<1x3x3x4xf32> - func.return %3: tensor<1x3x3x4xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_relu6_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> + %2 = stablehlo.constant dense<6.000000e+00> : tensor<1x3x3x4xf32> + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %4 = stablehlo.clamp %1, %3, %2 : tensor<1x3x3x4xf32> + func.return %4: tensor<1x3x3x4xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1) +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : 
tensor<1x3x3x4xf32> // CHECK: } @@ -166,16 +287,17 @@ func.func @conv_with_relu6_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4x // ----- // CHECK-LABEL: @dot_general_with_relu6_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<167x64xf32> -func.func @dot_general_with_relu6_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<167x64xf32>) -> tensor<1x1x64xf32> { - %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> - %1 = stablehlo.constant dense<6.000000e+00> : tensor<1x1x64xf32> - %2 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> - %3 = stablehlo.clamp %0, %2, %1 : tensor<1x1x64xf32> - return %3 : tensor<1x1x64xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_relu6_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.constant dense<6.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %4 = stablehlo.clamp %1, %3, %2 : tensor<1x1x64xf32> + return %4 : tensor<1x1x64xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1) +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> // CHECK: } @@ -190,17 +312,19 @@ func.func @dot_general_with_relu6_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<1 // ----- // CHECK-LABEL: @conv_with_bias_and_relu_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x3x4x4xf32>, -// CHECK-SAME: %[[ARG_2:.*]]: tensor<1x3x3x4xf32>) -func.func @conv_with_bias_and_relu_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf32>, %arg2: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { - %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> - %1 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - %2 = stablehlo.add %1, %arg2 : tensor<1x3x3x4xf32> - %3 = stablehlo.maximum %2, %0 : tensor<1x3x3x4xf32> - func.return %3: tensor<1x3x3x4xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_bias_and_relu_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x3x4xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %4 = stablehlo.add %3, %1 : tensor<1x3x3x4xf32> + %5 = stablehlo.maximum %4, %2 : tensor<1x3x3x4xf32> + func.return %5: tensor<1x3x3x4xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1, %arg2) +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = 
stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> // CHECK: } @@ -215,17 +339,19 @@ func.func @conv_with_bias_and_relu_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor< // ----- // CHECK-LABEL: @dot_general_with_bias_and_relu_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<167x64xf32> -// CHECK-SAME: %[[ARG_2:.*]]: tensor<1x1x64xf32>) -func.func @dot_general_with_bias_and_relu_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<167x64xf32>, %arg2: tensor<1x1x64xf32>) -> tensor<1x1x64xf32> { - %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> - %1 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> - %2 = stablehlo.add %1, %arg2 : tensor<1x1x64xf32> - %3 = stablehlo.maximum %2, %0 : tensor<1x1x64xf32> - func.return %3: tensor<1x1x64xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_bias_and_relu_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %4 = stablehlo.add %3, %1 : tensor<1x1x64xf32> + %5 = stablehlo.maximum %4, %2 : tensor<1x1x64xf32> + func.return %5: tensor<1x1x64xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1, %arg2) +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> // CHECK: } @@ -239,19 +365,91 @@ func.func @dot_general_with_bias_and_relu_fn(%arg0: tensor<1x1x167xf32>, %arg1: // ----- +// CHECK-LABEL: @conv_with_bias_and_relu_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_bias_and_relu_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<16xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %4 = shape.shape_of %3 : tensor -> tensor<4xindex> + %5 = stablehlo.dynamic_broadcast_in_dim %1, %4, dims = [3] : (tensor<16xf32>, tensor<4xindex>) -> tensor + %6 = stablehlo.add %3, %5 : tensor + %7 = shape.shape_of %6 : tensor -> tensor<4xindex> + %8 = stablehlo.dynamic_broadcast_in_dim %2, %7, dims = [] : (tensor, tensor<4xindex>) -> tensor + %9 = stablehlo.maximum %6, %8 : tensor + func.return %9: tensor +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] 
: tensor +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_bias_and_relu_dynamic_fn_1 +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[CONV]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK: %[[SHAPE_OF_1:.*]] = shape.shape_of %[[ADD]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_1:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF_1]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[DYNAMIC_BROADCAST_IN_DIM_1]] +// CHECK: return %[[MAX]] : tensor +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_and_relu_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @dot_general_with_bias_and_relu_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<12544x10xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<10xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor + %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<12544x10xf32>) -> tensor + %4 = shape.shape_of %3 : tensor -> tensor<2xindex> + %5 = stablehlo.dynamic_broadcast_in_dim %1, %4, dims = [1] : (tensor<10xf32>, tensor<2xindex>) -> tensor + %6 = stablehlo.add %3, %5 : tensor + %7 = shape.shape_of %6 : tensor -> tensor<2xindex> + %8 = stablehlo.dynamic_broadcast_in_dim %2, %7, dims = [] : (tensor, tensor<2xindex>) -> tensor + %9 = stablehlo.maximum %6, %8 : tensor + func.return %9: tensor +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_and_relu_dynamic_fn_1 +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[DOT_GENERAL]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK: %[[SHAPE_OF_1:.*]] = shape.shape_of %[[ADD]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_1:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF_1]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[DYNAMIC_BROADCAST_IN_DIM_1]] +// CHECK: return %[[MAX]] : tensor +// CHECK: } + +// ----- + // CHECK-LABEL: @conv_with_bias_and_relu6_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x3x4x4xf32>, -// CHECK-SAME: %[[ARG_2:.*]]: tensor<1x3x3x4xf32>) -func.func @conv_with_bias_and_relu6_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf32>, %arg2: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { - %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> - %1 = stablehlo.constant dense<6.000000e+00> : tensor<1x3x3x4xf32> - %2 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - %3 = 
stablehlo.add %2, %arg2 : tensor<1x3x3x4xf32> - %4 = stablehlo.clamp %0, %3, %1 : tensor<1x3x3x4xf32> - func.return %4: tensor<1x3x3x4xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_bias_and_relu6_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x3x4xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> + %3 = stablehlo.constant dense<6.000000e+00> : tensor<1x3x3x4xf32> + %4 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %5 = stablehlo.add %4, %1 : tensor<1x3x3x4xf32> + %6 = stablehlo.clamp %2, %5, %3 : tensor<1x3x3x4xf32> + func.return %6: tensor<1x3x3x4xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1, %arg2) +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> // CHECK: } @@ -267,18 +465,20 @@ func.func @conv_with_bias_and_relu6_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor // ----- // CHECK-LABEL: @dot_general_with_bias_and_relu6_fn( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<167x64xf32> -// CHECK-SAME: %[[ARG_2:.*]]: tensor<1x1x64xf32>) -func.func @dot_general_with_bias_and_relu6_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<167x64xf32>, %arg2: tensor<1x1x64xf32>) -> tensor<1x1x64xf32> { - %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> - %1 = stablehlo.constant dense<6.000000e+00> : tensor<1x1x64xf32> - %2 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> - %3 = stablehlo.add %2, %arg2 : tensor<1x1x64xf32> - %4 = stablehlo.clamp %0, %3, %1 : tensor<1x1x64xf32> - func.return %4: tensor<1x1x64xf32> +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_bias_and_relu6_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.constant dense<6.000000e+00> : tensor<1x1x64xf32> + %4 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %5 = stablehlo.add %4, %1 : tensor<1x1x64xf32> + %6 = stablehlo.clamp %2, %5, %3 : tensor<1x1x64xf32> + func.return %6: tensor<1x1x64xf32> } -// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %arg1, %arg2) +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) // CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> // CHECK: } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/post_quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/post_quantize.mlir new file mode 100644 index 
00000000000000..ae2f57081e40f7 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/post_quantize.mlir @@ -0,0 +1,72 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-post-quantize | FileCheck %s + +// CHECK-LABEL: @remove_volatile_qdq +func.func @remove_volatile_qdq() -> tensor<3x2xf32> { + // CHECK: %[[CST:.*]] = stablehlo.constant + // CHECK-NOT: "quantfork.qcast" + // CHECK-NOT: "quantfork.dcast" + // CHECK: return %[[CST]] + %cst = stablehlo.constant dense<[[-0.960978984, -0.390246302], [-0.790828585, -0.601039409], [-1.0280807, -1.02731466]]> : tensor<3x2xf32> + %q = "quantfork.qcast"(%cst) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + %dq = "quantfork.dcast"(%q) : (tensor<3x2x!quant.uniform>) -> tensor<3x2xf32> + func.return %dq : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @remove_volatile_qdq_with_requantization +// CHECK-SAME: %[[ARG0:.*]]: tensor<3x2xf32> +func.func @remove_volatile_qdq_with_requantization(%arg0: tensor<3x2xf32>) -> tensor<3x2xf32> { + // CHECK: %[[Q1:.*]] = stablehlo.uniform_quantize %[[ARG0]] + // CHECK: %[[Q2:.*]] = stablehlo.uniform_quantize %[[Q1]] + // CHECK: %[[ABS:.*]] = stablehlo.abs %[[Q2]] + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[ABS]] + // CHECK: %[[ADD:.*]] = stablehlo.add %[[ARG0]], %[[DQ]] + // CHECK: return %[[ADD]] + %q1 = "quantfork.qcast"(%arg0) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + %q2 = "quantfork.qcast"(%q1) {volatile} : (tensor<3x2x!quant.uniform>) -> tensor<3x2x!quant.uniform> + %dq1 = "quantfork.dcast"(%q2) : (tensor<3x2x!quant.uniform>) -> tensor<3x2xf32> + %abs = stablehlo.abs %q2 : (tensor<3x2x!quant.uniform>) -> tensor<3x2x!quant.uniform> + %dq2 = "quantfork.dcast"(%abs) : (tensor<3x2x!quant.uniform>) -> tensor<3x2xf32> + %add = stablehlo.add %dq1, %dq2 : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32> + func.return %add : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @quantize_constant +// CHECK-SAME: %[[ARG0:.*]]: tensor<1x3xf32> +func.func @quantize_constant(%arg0: tensor<1x3xf32>) -> tensor<1x2xf32> { + // CHECK-DAG: %[[QCST:.*]] = stablehlo.constant() {value = dense<-78> : tensor<3x2xi8>} : () -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> + // CHECK-DAG: %[[Q1:.*]] = stablehlo.uniform_quantize %[[ARG0]] + // CHECK-NOT: "quantfork.qcast" + // CHECK: %[[DOT:.*]] = stablehlo.dot %[[Q1]], %[[QCST]] + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // CHECK: return %[[DQ]] + %cst = stablehlo.constant dense<-0.390246302> : tensor<3x2xf32> + %q1 = "quantfork.qcast"(%arg0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %q2 = "quantfork.qcast"(%cst) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> + %dot = stablehlo.dot %q1, %q2 : (tensor<1x3x!quant.uniform>, tensor<3x2x!quant.uniform:f32, 5.000000e-03>>) -> tensor<1x2x!quant.uniform> + %dq = "quantfork.dcast"(%dot) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + func.return %dq : tensor<1x2xf32> +} + +// ----- + +// CHECK-LABEL: @convert_quantfork_qdq_to_stablehlo_uniform_qdq +// CHECK-SAME: %[[ARG0:.*]]: tensor<1x3xf32> +// CHECK-SAME: %[[ARG1:.*]]: tensor<3x2xf32> +func.func @convert_quantfork_qdq_to_stablehlo_uniform_qdq(%arg0: tensor<1x3xf32>, %arg1: tensor<3x2xf32>) -> tensor<1x2xf32> { + // CHECK: %[[Q1:.*]] = stablehlo.uniform_quantize %[[ARG0]] + // CHECK-NOT: "quantfork.qcast" + // CHECK: %[[Q2:.*]] = stablehlo.uniform_quantize %[[ARG1]] + // CHECK-NOT: "quantfork.qcast" + // CHECK: %[[DOT:.*]] = 
stablehlo.dot %[[Q1]], %[[Q2]] + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // CHECK: return %[[DQ]] + %q1 = "quantfork.qcast"(%arg0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %q2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> + %dot = stablehlo.dot %q1, %q2 : (tensor<1x3x!quant.uniform>, tensor<3x2x!quant.uniform:f32, 5.000000e-03>>) -> tensor<1x2x!quant.uniform> + %dq = "quantfork.dcast"(%dot) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + func.return %dq : tensor<1x2xf32> +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir index 612b8d6a519959..8f38f889f28e33 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir @@ -3,6 +3,7 @@ // ----- // CHECK-LABEL: func @dot +// CHECK-SAME: (%[[ARG_0:.*]]: tensor) -> tensor func.func @dot(%arg0: tensor) -> tensor { // CHECK: %[[cst:.*]] = stablehlo.constant // CHECK: %[[q1:.*]] = "quantfork.qcast"(%[[cst]]) @@ -10,7 +11,7 @@ func.func @dot(%arg0: tensor) -> tensor { // CHECK: %[[dq1:.*]] = "quantfork.dcast"(%[[q1]]) // CHECK-SAME: quant.uniform %cst = stablehlo.constant dense<[[-0.960978984, -0.390246302], [-0.790828585, -0.601039409], [-1.0280807, -1.02731466]]> : tensor<3x2xf32> - // CHECK: %[[q2:.*]] = "quantfork.qcast"(%arg0) + // CHECK: %[[q2:.*]] = "quantfork.qcast"(%[[ARG_0]]) // CHECK-SAME: quant.uniform // CHECK: %[[dq2:.*]] = "quantfork.dcast"(%[[q2]]) // CHECK-SAME: quant.uniform @@ -29,8 +30,9 @@ func.func @dot(%arg0: tensor) -> tensor { // ----- // CHECK-LABEL: func @duplicate_stats +// CHECK-SAME: (%[[ARG_0:.*]]: tensor<2x3xf32>) -> tensor<2x3xf32> func.func @duplicate_stats(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { - // CHECK: %[[q1:.*]] = "quantfork.qcast"(%arg0) + // CHECK: %[[q1:.*]] = "quantfork.qcast"(%[[ARG_0]]) // CHECK: %[[dq1:.*]] = "quantfork.dcast"(%[[q1]]) // CHECK: %[[q2:.*]] = "quantfork.qcast"(%[[dq1]]) // CHECK: %[[dq2:.*]] = "quantfork.dcast"(%[[q2]]) @@ -44,6 +46,7 @@ func.func @duplicate_stats(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // ----- // CHECK-LABEL: func @dot_redundant_stats +// CHECK-SAME: (%[[ARG_0:.*]]: tensor) -> tensor func.func @dot_redundant_stats(%arg0: tensor) -> tensor { // CHECK: %[[cst:.*]] = stablehlo.constant // CHECK: %[[q1:.*]] = "quantfork.qcast"(%[[cst]]) @@ -51,7 +54,7 @@ func.func @dot_redundant_stats(%arg0: tensor) -> tensor { // CHECK: %[[dq1:.*]] = "quantfork.dcast"(%[[q1]]) // CHECK-SAME: quant.uniform %cst = stablehlo.constant dense<[[-0.960978984, -0.390246302], [-0.790828585, -0.601039409], [-1.0280807, -1.02731466]]> : tensor<3x2xf32> - // CHECK: %[[q2:.*]] = "quantfork.qcast"(%arg0) + // CHECK: %[[q2:.*]] = "quantfork.qcast"(%[[ARG_0]]) // CHECK-SAME: quant.uniform // CHECK: %[[dq2:.*]] = "quantfork.dcast"(%[[q2]]) // CHECK-SAME: quant.uniform @@ -87,10 +90,11 @@ func.func @convert_same_scale_propagate(%arg0: tensor<2x3xf32>) -> tensor<2x3xf3 // ----- // CHECK-LABEL: func @merge_consecutive_qcast +// CHECK-SAME: (%[[ARG_0:.*]]: tensor<*xf32>, %[[ARG_1:.*]]: tensor<*xf32>, %[[ARG_2:.*]]: tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) func.func @merge_consecutive_qcast(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) { - // CHECK: "quantfork.qcast"(%arg1) + // CHECK: 
"quantfork.qcast"(%[[ARG_1]]) // CHECK-SAME: -> tensor<*x!quant.uniform> - // CHECK: "quantfork.qcast"(%arg1) + // CHECK: "quantfork.qcast"(%[[ARG_1]]) // CHECK-SAME: -> tensor<*x!quant.uniform> %0 = "quantfork.stats"(%arg0) {layerStats = dense<[-0.83811146, 2.4960899]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32> %1 = "quantfork.stats"(%arg1) {layerStats = dense<[-0.835039615, 1.000000e+00]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize_int4.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize_int4.mlir index 0ddb01cd0e0be4..ca467d1180ea0a 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize_int4.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize_int4.mlir @@ -1,6 +1,7 @@ // RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-prepare-quantize=bit-width=4 -verify-diagnostics | FileCheck %s // CHECK-LABEL: func @dot_int4 +// CHECK-SAME: (%[[ARG_0:.*]]: tensor) -> tensor func.func @dot_int4(%arg0: tensor) -> tensor { // CHECK: %[[cst:.*]] = stablehlo.constant // CHECK: %[[q1:.*]] = "quantfork.qcast"(%[[cst]]) @@ -8,7 +9,7 @@ func.func @dot_int4(%arg0: tensor) -> tensor { // CHECK: %[[dq1:.*]] = "quantfork.dcast"(%[[q1]]) // CHECK-SAME: quant.uniform %cst = stablehlo.constant dense<[[-0.960978984, -0.390246302], [-0.790828585, -0.601039409], [-1.0280807, -1.02731466]]> : tensor<3x2xf32> - // CHECK: %[[q2:.*]] = "quantfork.qcast"(%arg0) + // CHECK: %[[q2:.*]] = "quantfork.qcast"(%[[ARG_0]]) // CHECK-SAME: quant.uniform // CHECK: %[[dq2:.*]] = "quantfork.dcast"(%[[q2]]) // CHECK-SAME: quant.uniform diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir index eccd931d1b5008..d1bfea7a236448 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir @@ -20,7 +20,7 @@ func.func private @quantize_simple_xla_call_module(%arg0: tensor<1x4xf32>) -> te // CHECK: %[[CONST_0:.*]] = "stablehlo.constant"() {value = dense<1.000000e+00> : tensor<4x3xf32>} : () -> tensor<4x3xf32> // CHECK-DAG: %[[QCAST_0:.*]] = "quantfork.qcast"(%[[CONST_0]]) {volatile} : (tensor<4x3xf32>) -> tensor<4x3x!quant.uniform:f32, 5.000000e-03>> // CHECK-DAG: %[[QCAST_1:.*]] = "quantfork.qcast"(%[[ARG_0]]) {volatile} : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> -// CHECK: %[[XLACALLMODULE_0:.*]] = "tf.XlaCallModule"(%[[QCAST_1]], %[[QCAST_0]]) {{{.*}}} : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32, 5.000000e-03>>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[XLACALLMODULE_0:.*]] = "tf.XlaCallModule"(%[[QCAST_1]], %[[QCAST_0]]) <{{{.*}}}> {{{.*}}} : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32, 5.000000e-03>>) -> tensor<1x3x!quant.uniform> // CHECK: %[[DCAST_0:.*]] = "quantfork.dcast"(%[[XLACALLMODULE_0]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> // CHECK: "func.return"(%[[DCAST_0]]) : (tensor<1x3xf32>) -> () @@ -37,6 +37,6 @@ func.func private @quantize_simple_xla_call_module_no_operand() -> tensor<1x3xf3 // Tests that the output of the tf.XlaCallModule op has been replaced by // a quantized type, and the corresponding quantfork.qcast ops that turned // the float output to a quantized type is removed. 
-// CHECK: %[[XLACALLMODULE_0:.*]] = "tf.XlaCallModule"() {{{.*}}} : () -> tensor<1x3x!quant.uniform> +// CHECK: %[[XLACALLMODULE_0:.*]] = "tf.XlaCallModule"() <{{{.*}}}> {{{.*}}} : () -> tensor<1x3x!quant.uniform> // CHECK: %[[DCAST_0:.*]] = "quantfork.dcast"(%[[XLACALLMODULE_0]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> // CHECK: "func.return"(%[[DCAST_0]]) : (tensor<1x3xf32>) -> () diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir new file mode 100644 index 00000000000000..97ea1f30be81ba --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir @@ -0,0 +1,114 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -stablehlo-quantize-composite-functions | FileCheck %s + +module attributes {tf_saved_model.semantics} { +// The following pattern does not converge because of a bug in QuantizePass. +// TODO - b/305469508: Fix the QuantizePass to avoid this warning. +// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} + func.func private @quantize_dot_general(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<3x3xf32>} : () -> tensor<3x3xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } +// Checks that the quantized XlaCallModule has been replaced by a CallOp, which +// calls the quantized entry function. + +// CHECK-LABEL: func.func private @quantize_dot_general +// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<3x3xi8>} : () -> tensor<3x3x!quant.uniform:f32, {{.*}}> +// CHECK: %[[UNIFORM_QUANTIZE_0:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[CALL_0:.*]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x!quant.uniform>, tensor<3x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.*]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + + func.func private @composite_dot_general_fn(%arg0: tensor<1x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +// Checks that the entry function is quantized for dot_general. 
Quantized +// dot_general outputs an i32 quantized tensor, followed by requantization to +// i8 quantized tensor. + +// CHECK: func.func private @quantized_dot_general_fn(%[[ARG_2:.*]]: tensor<1x3x!quant.uniform>, %[[ARG_3:.*]]: tensor<3x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[DOT_GENERAL_0:.*]] = stablehlo.dot_general %[[ARG_2]], %[[ARG_3]], contracting_dims = [1] x [0] : (tensor<1x3x!quant.uniform>, tensor<3x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.*]] = stablehlo.uniform_quantize %[[DOT_GENERAL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> +} + +// ----- + +// Tests error when there are no corresponding entry function to quantize +// (@composite_dot_general_fn). + +module attributes {tf_saved_model.semantics} { +// The following pattern does not converge because of a bug in QuantizePass. +// TODO - b/305469508: Fix the QuantizePass to avoid this warning. +// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} + func.func private @error_when_no_entry_function(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<3x3xf32>} : () -> tensor<3x3xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> +// expected-error @+2 {{Failed to find a valid entry function}} +// expected-error @+1 {{'tf.XlaCallModule' op operand #0 must be variadic of tensor of tf.dtype values}} + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } +} + +// ----- + +// Tests that XlaCallModule op is not quantized without the quantfork.stats ops. + +module attributes {tf_saved_model.semantics} { + func.func private @not_quantized_without_stats(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<3x3xf32>} : () -> tensor<3x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +// Check that "tf.Const" is converted to stablehlo.constant. XlaCallModule is +// not quantized. 
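The i32-to-i8 requantization called out in the comments above amounts to rescaling the wide accumulator into the output's quantization parameters. A minimal sketch, with all scales and zero points chosen purely for illustration:

def requantize(acc, acc_scale, acc_zero_point, out_scale, out_zero_point,
               qmin=-128, qmax=127):
    # Map the i32 accumulator back to the real domain, then onto the i8 output grid.
    real_value = (acc - acc_zero_point) * acc_scale
    q = round(real_value / out_scale) + out_zero_point
    return max(qmin, min(qmax, q))

# For a quantized dot_general the accumulator scale is typically the product of
# the input and weight scales; the numbers here are made up.
print(requantize(5000, acc_scale=3.0e-05, acc_zero_point=0,
                 out_scale=0.003, out_zero_point=-10))  # -> 40

The CHECK lines above express the same thing at the type level: dot_general yields the wider i32 quantized type and stablehlo.uniform_quantize narrows it to the i8 output type.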
+ +// CHECK-LABEL: func.func private @not_quantized_without_stats +// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<3.000000e-01> : tensor<3x3xf32> +// CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[ARG_1]], %[[CONST_0]]) <{{{.*}}}> {{{.*_entry_function = @composite_dot_general_fn.*}}} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> +// CHECK: return %[[XLA_CALL_MODULE_0]] + + func.func private @composite_dot_general_fn(%arg0: tensor<1x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +// Check that the composite_dot_general_fn is untouched. + +// CHECK: func.func private @composite_dot_general_fn(%[[ARG_2:.*]]: tensor<1x3xf32>, %[[ARG_3:.*]]: tensor<3x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %[[ARG_2]], %[[ARG_3]] +// CHECK: return %[[DOT_GENERAL]] +} + +// ----- + +// Tests that a fusion pattern for dot_general is not yet supported. Further op +// coverage will be provided in the future. +// TODO - b/307620428: Increase op coverage to cover this test case. + +module attributes {tf_saved_model.semantics} { +// The following pattern does not converge because of a bug in QuantizePass. +// TODO - b/305469508: Fix the QuantizePass to avoid this warning. +// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} + func.func private @dot_general_fn_fusion_not_quantized(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<3x3xf32>} : () -> tensor<3x3xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> +// expected-error @+1 {{'tf.XlaCallModule' op operand #0 must be variadic of tensor of tf.dtype values}} + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_fn(%arg0: tensor<1x3xf32>, %arg1: tensor<3x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir index 57423f5f03a7ec..3d04c72dec7f7e 100644 --- 
a/tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir @@ -30,13 +30,13 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p return %9 : tensor<1x64xf32> } - // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]] = "tf.XlaCallModule"() {Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], _entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - // CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]]) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}}> {_entry_function = @_stablehlo_main_1 + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + // CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_0]]) - // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]] = "tf.XlaCallModule"() {Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], _entry_function = @_stablehlo_main_0 - // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]] = "tf.CustomAggregator"(%[[CUSTOM_AGGREGATOR_1]]) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> - // CHECK: %[[XLA_CALL_MODULE_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]]) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1" + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}}> {_entry_function = @_stablehlo_main_0 + // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]] = "tf.CustomAggregator"(%[[CUSTOM_AGGREGATOR_1]]) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> + // CHECK: %[[XLA_CALL_MODULE_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = 
@composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_3:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_1:.*]]) // CHECK: return %[[CUSTOM_AGGREGATOR_3]] : tensor<1x64xf32> // CHECK: } @@ -60,6 +60,39 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p } } + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1654 : i32}, tf_saved_model.semantics} { + + // CHECK: func private @_stablehlo_main_0 + // CHECK: %[[CONSTANT_0:.*]] = stablehlo.constant dense<0.134728625> : tensor<1x3xf32> + // CHECK: %[[CONSTANT_1:.*]] = stablehlo.constant dense<-1.280000e+02> : tensor<1x1024xf32> + // CHECK: %[[CONSTANT_2:.*]] = stablehlo.constant dense<0.003921567> : tensor<1x1024xf32> + // CHECK: %[[DIVIDE:.*]] = stablehlo.divide %arg0, %[[CONSTANT_2]] + // CHECK: %[[ADD:.*]] = stablehlo.add %[[DIVIDE]], %[[CONSTANT_1]] + // CHECK return %[[ADD]] + // CHECK: } + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x1024xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<0.134728625> : tensor<1x3xf32> + %1 = stablehlo.constant dense<-1.280000e+02> : tensor<1x1024xf32> + %2 = stablehlo.constant dense<0.003921567> : tensor<1x1024xf32> + %3 = stablehlo.divide %arg0, %2 : tensor<1x1024xf32> + %4 = stablehlo.add %3, %1 : tensor<1x1024xf32> + %5 = "tf.Identity"(%4) {device = ""} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + return %5 : tensor<1x1024xf32> + } + + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"(%arg0) <{Sout = [#tf_type.shape<1x1024>] + // CHECK-SAME: _entry_function = @_stablehlo_main_0 + // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP]]) + // CHECK: return %[[IDENTITY]] + // CHECK } + +} + // ----- module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { @@ -77,9 +110,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p return %3 : tensor<1x3xf32> } - // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1024x3>], _entry_function = @_stablehlo_main_ - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]]) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_ + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + // CHECK: 
%[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: return %[[CUSTOM_AGGREGATOR_1]] // CHECK: } @@ -109,8 +142,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p } // CHECK: %[[CONSTANT:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[XLA_CALL_MODULE_EXTRACTED_FROM_SUBGRAPH:.*]]) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[XLA_CALL_MODULE_EXTRACTED_FROM_SUBGRAPH:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: return %[[CUSTOM_AGGREGATOR_1]] // CHECK: } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc index 197d5dd237e58d..bfd9de9ca60d25 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc @@ -37,6 +37,17 @@ UniformQuantizedType CreateI8F32UniformQuantizedType(const Location loc, /*storageTypeMin=*/llvm::minIntN(8), /*storageTypeMax=*/llvm::maxIntN(8)); } +UniformQuantizedType CreateI32F32UniformQuantizedType( + const Location loc, MLIRContext& context, const float scale, + const int32_t zero_point) { + return UniformQuantizedType::getChecked( + loc, /*flags=*/QuantizationFlags::Signed, + /*storageType=*/IntegerType::get(&context, /*width=*/32), + /*expressedType=*/FloatType::getF32(&context), scale, zero_point, + /*storageTypeMin=*/llvm::minIntN(32), + /*storageTypeMax=*/llvm::maxIntN(32)); +} + UniformQuantizedPerAxisType CreateI8F32UniformQuantizedPerAxisType( const Location loc, MLIRContext& context, const ArrayRef scales, const ArrayRef zero_points, const int quantization_dimension) { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h index 84f5ae274e047a..68774b2ecb876b 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h @@ -35,6 +35,16 @@ UniformQuantizedType CreateI8F32UniformQuantizedType(Location loc, float scale, int8_t zero_point); +// Creates a `UniformQuantizedType` with the given `scale` and 
`zero_point` +// values. The produced type has f32 as its expressed type and i32 as its +// storage type. The available values use the full range of the storage value. +// Assumes asymmetric quantization, meaning the zero point values can be +// non-zero values. +UniformQuantizedType CreateI32F32UniformQuantizedType(Location loc, + MLIRContext& context, + float scale, + int32_t zero_point); + // Creates a `UniformQuantizedPerAxisType` with the given `scales` and // `zero_points` values. The produced type has f32 as its expressed type and // i8 as its storage type. The available values use the full range of the diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc index eb40dcb6afb617..0888bfa8d22908 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h" #include +#include #include #include @@ -81,6 +82,60 @@ TEST_F(CreateI8F32UniformQuantizedTypeTest, HasScaleAndZeroPointProperlySet) { EXPECT_EQ(quantized_type.getZeroPoint(), 99); } +class CreateI32F32UniformQuantizedTypeTest : public ::testing::Test { + protected: + CreateI32F32UniformQuantizedTypeTest() : ctx_() { + ctx_.loadDialect(); + } + + MLIRContext ctx_; +}; + +TEST_F(CreateI32F32UniformQuantizedTypeTest, HasI32StorageType) { + const UniformQuantizedType quantized_type = + CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, + /*scale=*/1.0, /*zero_point=*/0); + + EXPECT_TRUE(quantized_type.getStorageType().isSignlessInteger(32)); +} + +TEST_F(CreateI32F32UniformQuantizedTypeTest, HasF32ExpressedType) { + const UniformQuantizedType quantized_type = + CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, + /*scale=*/1.0, /*zero_point=*/0); + + EXPECT_TRUE(quantized_type.getExpressedType().isF32()); +} + +TEST_F(CreateI32F32UniformQuantizedTypeTest, IsSigned) { + const UniformQuantizedType quantized_type = + CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, + /*scale=*/1.0, /*zero_point=*/0); + + EXPECT_TRUE(quantized_type.isSigned()); +} + +TEST_F(CreateI32F32UniformQuantizedTypeTest, + SotrageTypeMinMaxEqualToI32MinMax) { + const UniformQuantizedType quantized_type = + CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, + /*scale=*/1.0, /*zero_point=*/0); + + EXPECT_EQ(quantized_type.getStorageTypeMin(), + std::numeric_limits::min()); + EXPECT_EQ(quantized_type.getStorageTypeMax(), + std::numeric_limits::max()); +} + +TEST_F(CreateI32F32UniformQuantizedTypeTest, HasScaleAndZeroPointProperlySet) { + const UniformQuantizedType quantized_type = + CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, + /*scale=*/8.0, /*zero_point=*/1111); + + EXPECT_EQ(quantized_type.getScale(), 8.0); + EXPECT_EQ(quantized_type.getZeroPoint(), 1111); +} + class CreateI8F32UniformQuantizedPerAxisTypeTest : public ::testing::Test { protected: CreateI8F32UniformQuantizedPerAxisTypeTest() : ctx_() { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD index aec612c95b7b62..286e0e8799121f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD @@ 
-117,6 +117,15 @@ cc_library( ], ) +cc_library( + name = "id_assigner", + hdrs = ["id_assigner.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + ], +) + pytype_strict_library( name = "calibration_algorithm", srcs = ["calibration_algorithm.py"], diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/id_assigner.h b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/id_assigner.h new file mode 100644 index 00000000000000..ae75d9f579b090 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/id_assigner.h @@ -0,0 +1,37 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_ID_ASSIGNER_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_ID_ASSIGNER_H_ + +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" + +namespace tensorflow::quantization { + +// An interface that assigns UUIDs to CustomAggregator ops. +class CustomAggregatorIdAssigner { + public: + virtual ~CustomAggregatorIdAssigner() = default; + + // Assigns UUIDs to each CustomAggregator op found in each GraphDef in + // `exported_model`. The UUIDs are set to the `id` attributes. The UUIDs will + // be used during the calibration step to identify the collected quantization + // statistics for each CustomAggregator op. + virtual ExportedModel AssignIds( + const ExportedModel& exported_model) const = 0; +}; + +} // namespace tensorflow::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_ID_ASSIGNER_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc index d415e755e9c6f6..22f95fa5369215 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc @@ -45,8 +45,8 @@ bool IsOpWithQuantizableTrait(Operation* op) { // Supported quantizable ops.
return isa(op); + TF::ResourceGatherOp, TF::DepthwiseConv2dNativeOp, TF::Conv3DOp, + TF::BatchMatMulV2Op, TF::EinsumOp>(op); } bool IsOpWithInt8TypeOperand(Operation* op) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc index d6ad4605863116..be72660f10091c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc @@ -144,7 +144,7 @@ class QuantizeCompositeFunctionsPass "drq", "Post-training dynamic-range quantizaiton"), clEnumValN(tensorflow::quantization::QuantizationMethod:: METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8, - "weight_only", "Post-training weight-only quantizaiton"))}; + "weight_only", "Post-training weight-only quantization"))}; Option target_opset_{ *this, "target-opset", llvm::cl::init(OpSet::TF), @@ -1283,9 +1283,7 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { ctx, target_opset_); patterns_2.add(ctx, target_opset_); - if (target_opset_ == OpSet::XLA && enable_per_channel_quantization_ && - quantization_method_ == tensorflow::quantization::QuantizationMethod:: - METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8) { + if (target_opset_ == OpSet::XLA && enable_per_channel_quantization_) { patterns_2.add(ctx); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD index 5b1815fc4fd8f8..ac1b64848b5bc1 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD @@ -108,8 +108,8 @@ tf_python_pybind_extension( "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:id_assigner", "//tensorflow/python/lib/core:pybind11_lib", - "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", @@ -178,7 +178,6 @@ pytype_strict_library( "//tensorflow/python/framework:ops", "//tensorflow/python/framework:tensor_conversion", "//tensorflow/python/lib/io:file_io", - "//tensorflow/python/platform:tf_logging", "//tensorflow/python/saved_model:load", "//tensorflow/python/saved_model:loader", "//tensorflow/python/saved_model:signature_constants", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py index d7156be21fb843..c96e3a776c79af 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py @@ -1548,7 +1548,7 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'dilations': [1, 2, 2, 1], }, { - 'testcase_name': 'with_bias_and_relu6_to_xla', + 'testcase_name': 'with_bias_and_relu6_to_xla_per_tensor', 'activation_fn': nn_ops.relu6, 'has_bias': True, 'has_batch_norm': False, @@ -1556,6 +1556,15 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'input_shape_dynamic': False, 
'enable_per_channel_quantization': False, }, + { + 'testcase_name': 'with_bias_and_relu6_to_xla_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + }, { 'testcase_name': 'dilation_with_bias_and_relu6_to_xla', 'activation_fn': nn_ops.relu6, @@ -1566,6 +1575,16 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'enable_per_channel_quantization': False, 'dilations': [1, 2, 2, 1], }, + { + 'testcase_name': 'dilation_with_bias_and_relu6_to_xla_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + 'dilations': [1, 2, 2, 1], + }, { 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla', 'activation_fn': nn_ops.relu6, @@ -1575,6 +1594,15 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'input_shape_dynamic': False, 'enable_per_channel_quantization': False, }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + }, { 'testcase_name': 'dilation_with_bias_and_bn_and_relu6_to_xla', 'activation_fn': nn_ops.relu6, @@ -1585,6 +1613,18 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'enable_per_channel_quantization': False, 'dilations': [1, 2, 2, 1], }, + { + 'testcase_name': ( + 'dilation_with_bias_and_bn_and_relu6_to_xla_per_channel' + ), + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + 'dilations': [1, 2, 2, 1], + }, { 'testcase_name': 'with_bias_and_relu6_to_xla_dynamic', 'activation_fn': nn_ops.relu6, @@ -1594,6 +1634,15 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'input_shape_dynamic': True, 'enable_per_channel_quantization': False, }, + { + 'testcase_name': 'with_bias_and_relu6_to_xla_dynamic_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': True, + }, { 'testcase_name': 'dilation_with_bias_and_relu6_to_xla_dynamic', 'activation_fn': nn_ops.relu6, @@ -1604,6 +1653,18 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'enable_per_channel_quantization': False, 'dilations': [1, 2, 2, 1], }, + { + 'testcase_name': ( + 'dilation_with_bias_and_relu6_to_xla_dynamic_per_channel' + ), + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': True, + 'dilations': [1, 2, 2, 1], + }, { 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla_dynamic', 'activation_fn': nn_ops.relu6, @@ -1613,6 +1674,17 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'input_shape_dynamic': True, 'enable_per_channel_quantization': False, }, + { + 'testcase_name': ( + 'with_bias_and_bn_and_relu6_to_xla_dynamic_per_channel' + ), + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': True, + }, { 'testcase_name': 
'dilation_with_bias_and_bn_and_relu6_to_xla_dynamic', 'activation_fn': nn_ops.relu6, @@ -1623,6 +1695,18 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'enable_per_channel_quantization': False, 'dilations': [1, 2, 2, 1], }, + { + 'testcase_name': ( + 'dilation_with_bias_and_bn_and_relu6_to_xla_dynamic_per_channel' + ), + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': True, + 'dilations': [1, 2, 2, 1], + }, { 'testcase_name': 'with_bias_and_relu6_to_uq', 'activation_fn': nn_ops.relu6, @@ -1787,6 +1871,28 @@ def data_gen() -> repr_dataset.RepresentativeDataset: if target_opset == quant_opts_pb2.XLA: self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2')) + if enable_per_channel_quantization: + per_channel_size_attr = attr_value_pb2.AttrValue( + list=attr_value_pb2.AttrValue.ListValue( + shape=[ + tensor_shape_pb2.TensorShapeProto( + dim=[ + tensor_shape_pb2.TensorShapeProto.Dim( + size=filter_shape[-1] + ) + ] + ) + ] + ) + ) + self.assertTrue( + self._contains_op( + output_graphdef, + 'Const', + '_output_shapes', + per_channel_size_attr, + ) + ) elif target_opset == quant_opts_pb2.UNIFORM_QUANTIZED: self.assertTrue( self._contains_op(output_graphdef, 'UniformQuantizedConvolution') @@ -2050,6 +2156,15 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 'input_shape_dynamic': False, 'enable_per_channel_quantization': False, }, + { + 'testcase_name': 'with_bias_and_relu6_to_xla_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + }, { 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla', 'activation_fn': nn_ops.relu6, @@ -2059,6 +2174,15 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 'input_shape_dynamic': False, 'enable_per_channel_quantization': False, }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + }, { 'testcase_name': 'with_bias_and_relu6_to_xla_dynamic', 'activation_fn': nn_ops.relu6, @@ -2068,6 +2192,15 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 'input_shape_dynamic': True, 'enable_per_channel_quantization': False, }, + { + 'testcase_name': 'with_bias_and_relu6_to_xla_dynamic_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': True, + }, { 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla_dynamic', 'activation_fn': nn_ops.relu6, @@ -2077,6 +2210,17 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 'input_shape_dynamic': True, 'enable_per_channel_quantization': False, }, + { + 'testcase_name': ( + 'with_bias_and_bn_and_relu6_to_xla_dynamic_per_channel' + ), + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': True, + }, { 'testcase_name': 'with_bias_and_relu6_to_uq', 'activation_fn': nn_ops.relu6, @@ -2172,6 +2316,28 @@ def data_gen() -> repr_dataset.RepresentativeDataset: self.assertTrue( self._contains_op(output_graphdef, 'DepthwiseConv2dNative') ) + 
if enable_per_channel_quantization: + per_channel_size_attr = attr_value_pb2.AttrValue( + list=attr_value_pb2.AttrValue.ListValue( + shape=[ + tensor_shape_pb2.TensorShapeProto( + dim=[ + tensor_shape_pb2.TensorShapeProto.Dim( + size=filter_shape[-1] * filter_shape[2] + ) + ] + ) + ] + ) + ) + self.assertTrue( + self._contains_op( + output_graphdef, + 'Const', + '_output_shapes', + per_channel_size_attr, + ) + ) elif target_opset == quant_opts_pb2.UNIFORM_QUANTIZED: self.assertTrue( self._contains_op(output_graphdef, 'UniformQuantizedConvolution') @@ -2337,11 +2503,6 @@ def data_gen() -> repr_dataset.RepresentativeDataset: else: self.assertAllClose(new_outputs, expected_outputs, atol=0.13) - # NOTE: Isolated the most basic configuration from `test_matmul_ptq_model` - # for StableHLO PTQ prototype testing while integrating. Please note this - # test is for intermediate testing purposes as the migration is not complete. - # TODO: b/298581932 - Add the full test case for STABLEHLO opset once - # migration is complete. @test_util.run_in_graph_and_eager_modes def test_matmul_ptq_model_stablehlo(self): activation_fn = None @@ -2353,7 +2514,7 @@ def test_matmul_ptq_model_stablehlo(self): input_shape = (*lhs_batch_size, 1, 1024) filter_shape = (*rhs_batch_size, 1024, 3) static_input_shape = [dim if dim is not None else 2 for dim in input_shape] - self._create_matmul_model( + model = self._create_matmul_model( input_shape, filter_shape, self._input_saved_model_path, @@ -2362,38 +2523,47 @@ def test_matmul_ptq_model_stablehlo(self): ) rng = np.random.default_rng(seed=1234) + input_data = ops.convert_to_tensor( + rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype( + np.float32 + ) + ) + expected_outputs = model.matmul(input_data) + def data_gen() -> repr_dataset.RepresentativeDataset: - for _ in range(5): + for _ in range(100): yield { 'input_tensor': rng.uniform( low=0.0, high=1.0, size=static_input_shape ).astype(np.float32) } - tags = {tag_constants.SERVING} - quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8 ), - tags=tags, + tags={tag_constants.SERVING}, signature_keys=['serving_default'], op_set=target_opset, ) - # TODO: b/299545836 - Remove exception handling below after migrating - # StableHLO export passes. - with self.assertRaisesRegex( # pylint: disable=g-error-prone-assert-raises - Exception, - "Failed to convert MLIR to GraphDef. op node 'quantfork.stats' was not" - ' a TF op', - ): - converted_model = quantize_model.quantize( - self._input_saved_model_path, - self._output_saved_model_path, - quantization_options, - representative_dataset=data_gen(), - ) - self.assertIsNotNone(converted_model) + converted_model = quantize_model.quantize( + self._input_saved_model_path, + self._output_saved_model_path, + quantization_options, + representative_dataset=data_gen(), + ) + + self.assertIsNotNone(converted_model) + self.assertCountEqual( + converted_model.signatures._signatures.keys(), {'serving_default'} + ) + + new_outputs = converted_model.signatures['serving_default']( + input_tensor=ops.convert_to_tensor(input_data) + ) + # Tests that the quantized graph outputs similar values. The rtol value is + # arbitrary. 
+ self.assertAllClose(new_outputs, expected_outputs, rtol=0.02) @parameterized.named_parameters( { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc index 97d36d4984f3e3..c08aaf78376f8b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc @@ -34,6 +34,7 @@ limitations under the License. #include "pybind11_protobuf/native_proto_caster.h" // from @pybind11_protobuf #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/id_assigner.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" @@ -43,6 +44,7 @@ namespace { using ::tensorflow::calibrator::CalibrationStatistics; using ::tensorflow::calibrator::CalibratorSingleton; +using ::tensorflow::quantization::CustomAggregatorIdAssigner; using ::tensorflow::quantization::ExportedModel; using ::tensorflow::quantization::QuantizationOptions; using ::tensorflow::quantization::QuantizePtqDynamicRange; @@ -86,8 +88,8 @@ CalibrationStatistics GetStatisticsFromCalibrator(const absl::string_view id) { namespace pybind11 { namespace detail { -// Converts `ExportedModel` (c++) to `bytes` (python). The resulting `bytes` -// object is a serialization of `ExportedModel`. +// Handles `ExportedModel` (c++) <-> `bytes` (python) conversion. The `bytes` +// object in the python layer is a serialization of `ExportedModel`. // // See https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html for // further details on how custom type conversions work for pybind11. @@ -96,6 +98,21 @@ struct type_caster { public: PYBIND11_TYPE_CASTER(ExportedModel, const_name("ExportedModel")); + // Loads an `ExportedModel` instance from a python `bytes` object (`src`). + bool load(handle src, const bool convert) { + auto caster = make_caster(); + // Make sure the user passed a valid python string. + if (!caster.load(src, convert)) { + return false; + } + + const absl::string_view exported_model_serialized = + cast_op(std::move(caster)); + + // NOLINTNEXTLINE: Explicit std::string conversion required for OSS. + return value.ParseFromString(std::string(exported_model_serialized)); + } + // Constructs a `bytes` object after serializing `src`. static handle cast(ExportedModel&& src, return_value_policy policy, handle parent) { @@ -103,6 +120,14 @@ struct type_caster { // destruction of py::bytes and returns a raw python object handle. return py::bytes(Serialize(src)).release(); } + + // Constructs a `bytes` object after serializing `src`. + static handle cast(const ExportedModel& src, return_value_policy policy, + handle parent) { + // release() prevents the reference count from decreasing upon the + // destruction of py::bytes and returns a raw python object handle. + return py::bytes(Serialize(src)).release(); + } }; // Python -> cpp conversion for `QuantizationOptions`. 
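On the Python side of this caster, the `bytes` value is nothing more than a serialized ExportedModel proto. A minimal sketch of the round trip, assuming the exported_model_pb2 module referenced later in this change (the import path is an assumption):

from tensorflow.compiler.mlir.quantization.tensorflow import exported_model_pb2

def touch_exported_model(exported_model_serialized: bytes) -> bytes:
    # Parse the serialized proto handed over by pybind11, inspect or edit it,
    # and serialize it back so the C++ side can parse it again.
    exported_model = exported_model_pb2.ExportedModel.FromString(
        exported_model_serialized
    )
    return exported_model.SerializeToString()

This is exactly the shape of the assign_ids override shown further down in quantize_model.py.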
Accepts a serialized @@ -130,10 +155,30 @@ struct type_caster { } // namespace detail } // namespace pybind11 +namespace { + +// A "trampoline" class that redirects virtual function calls to the python +// implementation. +// +// Reference: +// https://pybind11.readthedocs.io/en/stable/advanced/classes.html#overriding-virtual-functions-in-python +class CustomAggregatorIdAssignerTrampoline : public CustomAggregatorIdAssigner { + public: + using CustomAggregatorIdAssigner::CustomAggregatorIdAssigner; + + ExportedModel AssignIds(const ExportedModel& exported_model) const override { + PYBIND11_OVERRIDE_PURE(ExportedModel, CustomAggregatorIdAssigner, + assign_ids, exported_model); + } +}; + +} // namespace + PYBIND11_MODULE(pywrap_quantize_model, m) { // Supports absl::StatusOr type conversions. pybind11::google::ImportStatusModule(); pybind11_protobuf::ImportNativeProtoCasters(); + // Calibrator related functions. m.def( "clear_calibrator", @@ -156,6 +201,14 @@ PYBIND11_MODULE(pywrap_quantize_model, m) { Returns the proto CalibrationStatistics given id from calibrator. )pbdoc"); + // Exports `CustomAggregatorIdAssigner` class. A pure virtual member function + // `AssignIds` is mapped to `assign_ids` in python, which is expected to be + // inherited and overridden. + py::class_( + m, "CustomAggregatorIdAssigner") + .def(py::init<>()) + .def("assign_ids", &CustomAggregatorIdAssigner::AssignIds); + // Quantization functions. m.def( "quantize_qat_model", @@ -218,11 +271,17 @@ PYBIND11_MODULE(pywrap_quantize_model, m) { const std::vector& signature_keys, const std::unordered_set& tags, const QuantizationOptions& quant_opts, - const absl::flat_hash_map& function_aliases) + const absl::flat_hash_map& function_aliases, + const CustomAggregatorIdAssigner& custom_aggregator_id_assigner) -> absl::StatusOr { - return QuantizePtqModelPreCalibration(saved_model_path, signature_keys, - tags, quant_opts, - function_aliases); + const absl::StatusOr exported_model = + QuantizePtqModelPreCalibration(saved_model_path, signature_keys, + tags, quant_opts, function_aliases); + if (!exported_model.ok()) { + return exported_model.status(); + } + + return custom_aggregator_id_assigner.AssignIds(*exported_model); }, R"pbdoc( Returns serialized ExportedModel that contains the model's GraphDef and @@ -230,6 +289,10 @@ PYBIND11_MODULE(pywrap_quantize_model, m) { user should pass a serialized `QuantizationOptions` for the `quant_opts` argument. + The argument `custom_aggregator_id_assigner` is an instance of + `CustomAggregatorIdAssigner` whose virtual function `assign_ids` is + implemented in python. + Raises `StatusNotOk` exception if when the run was unsuccessful. )pbdoc"); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py index 43dbd283a67fdf..b83461ba0ed542 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py @@ -740,6 +740,38 @@ def _get_saver_def_or_none( return None +class CustomAggregatorIdAssigner( + pywrap_quantize_model.CustomAggregatorIdAssigner +): + """Python impl. of `pywrap_quantize_model.CustomAggregatorIdAssigner`. + + The interface is defined in the C++ layer, exposing a pure virtual function + `assign_ids`. + """ + + def assign_ids(self, exported_model_serialized: bytes) -> bytes: + """Assigns UUIDs to each CustomAggregator op find in the graph def. 
+ + Args: + exported_model_serialized: Serialized `ExportedModel` instance. + + Returns: + Serialized `ExportedModel` whose CustomAggregator ops are assigned UUIDs + to their `id` attributes. + """ + exported_model = exported_model_pb2.ExportedModel.FromString( + exported_model_serialized + ) + + graph_def = exported_model.graph_def + for function_def in graph_def.library.function: + for node_def in function_def.node_def: + if node_def.op == 'CustomAggregator': + node_def.attr['id'].s = uuid.uuid4().hex.encode('ascii') + + return exported_model.SerializeToString() + + def _run_static_range_ptq( src_saved_model_path: str, dst_saved_model_path: str, @@ -780,6 +812,7 @@ def _run_static_range_ptq( set(quant_opts.tags), quant_opts.SerializeToString(), dict(function_aliases), + CustomAggregatorIdAssigner(), ) ) @@ -788,11 +821,6 @@ def _run_static_range_ptq( ) graph_def = exported_model.graph_def - for function_def in graph_def.library.function: - for node_def in function_def.node_def: - if node_def.op == 'CustomAggregator': - node_def.attr['id'].s = uuid.uuid4().hex.encode('ascii') - pre_calib_output_model_path = tempfile.mkdtemp() save_model.save_model_v1( graph_def, @@ -1376,7 +1404,7 @@ def _populate_quantization_options_default_values( quantization_options.min_num_elements_for_weights = ( _DYNAMIC_RANGE_DEFAULT_MIN_NUM_ELEMENTS_FOR_WEIGHTS ) - logging.warn( + logging.warning( ( 'QuantizationOptions.min_num_elements_for_weights is not set (0).' ' Setting to the default value: %d.' @@ -1384,15 +1412,23 @@ def _populate_quantization_options_default_values( _DYNAMIC_RANGE_DEFAULT_MIN_NUM_ELEMENTS_FOR_WEIGHTS, ) - # TODO(b/281595329): Implement static range quantization per-channel support + # TODO: b/307900054 - Set the per-channel quantization by default. if quantization_options.enable_per_channel_quantization and not ( - quantization_options.op_set == quant_opts_pb2.OpSet.UNIFORM_QUANTIZED - or quantization_options.quantization_method.preset_method - == _PresetMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8 + ( + quantization_options.op_set == quant_opts_pb2.OpSet.UNIFORM_QUANTIZED + or quantization_options.quantization_method.preset_method + == _PresetMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8 + ) + or ( + quantization_options.op_set == quant_opts_pb2.OpSet.XLA + and quantization_options.quantization_method.preset_method + == _PresetMethod.METHOD_STATIC_RANGE_INT8 + ) ): raise ValueError( - 'Currently, per-channel quantization is supported for Uniform ' - 'Quantized opset and Weight-only.' + 'Currently, per-channel quantization is supported for Uniform Quantized' + ' opset, weight only quantization, or XLA opset with static range' + ' quantization.' ) if ( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto index 01cf3401f9a37a..4384a13c8e05eb 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto @@ -289,9 +289,10 @@ message QuantizationOptions { // If not set, it defaults to `true`. optional bool freeze_all_variables = 9; - // Enables chnanel-wise quantizaiton. By default, channel-wise quantization is + // Enables channel-wise quantization. By default, channel-wise quantization is // not applied regardless of the op support. Currently, it is supported for - // Uniform Quantized opset only. 
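The relaxed check above means per-channel quantization can now also be requested for the XLA opset with static-range int8. A hedged configuration sketch using the quant_opts_pb2 names that appear in this change (the module alias and the _PresetMethod definition are assumptions for illustration):

from tensorflow.compiler.mlir.quantization.tensorflow import (
    quantization_options_pb2 as quant_opts_pb2,
)

# Assumed alias, mirroring how preset methods are referenced in this change.
_PresetMethod = quant_opts_pb2.QuantizationMethod.PresetMethod

quantization_options = quant_opts_pb2.QuantizationOptions(
    quantization_method=quant_opts_pb2.QuantizationMethod(
        preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
    ),
    op_set=quant_opts_pb2.XLA,
    # Per-channel applies to weight tensors (not activations) under XLA SRQ.
    enable_per_channel_quantization=True,
)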
+ // XLA opset for SRQ on weight tensors (not activation), + // and Uniform Quantized opset . bool enable_per_channel_quantization = 10; // Enables two inputs of an operation to be both tensors. diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc index 6fb3c91b15e129..0b9cdc09ca5b93 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc @@ -52,7 +52,7 @@ void AddStablehloQuantToIntPasses(mlir::PassManager &pm) { void AddStaticRangeQuantizationPass( mlir::PassManager &pm, const QuantizationOptions &quantization_options, std::optional mlir_dump_file_prefix) { - // TODO: b/299545840 - Include QuantizeCompositeFunctionsPass as in bug. + pm.addPass(mlir::quant::stablehlo::createQuantizeCompositeFunctionsPass()); } void AddConvertTpuToCpuModelPasses(mlir::PassManager &pm) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir index 4ee9b746fff9f6..bd3546299c1967 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir @@ -23,21 +23,21 @@ module { } // WholeModel-LABEL: func @conv -// WholeModel-DAG: %[[w:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}1.600000e-01, 1.000000e-01 -// WholeModel-DAG: %[[b:.*]] = "tf.Const"() {value = dense<[-2.000000e+00, 3.000000e+00 -// WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} -// WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} -// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> () +// WholeModel-DAG: %[[w:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}1.600000e-01, 1.000000e-01 +// WholeModel-DAG: %[[b:.*]] = "tf.Const"() <{value = dense<[-2.000000e+00, 3.000000e+00 +// WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> +// WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> () // WholeModel-DAG: return %[[output0]], %[[output1]] // PerLayer-LABEL: func @conv -// PerLayer-DAG: %[[w:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}1.600000e-01, 1.000000e-01 -// PerLayer-DAG: %[[b:.*]] = "tf.Const"() {value = dense<[-2.000000e+00, 3.000000e+00 -// PerLayer-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) {config = "", config_proto = "", executor_type = "", f = 
@composite_conv2d_with_bias_and_relu6_fn_2} -// PerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} -// PerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %cst, %cst_0) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0} -// PerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) {enabled = false, file_name = "quantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> () -// PerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} : (tensor<*xf32>) -> () +// PerLayer-DAG: %[[w:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}1.600000e-01, 1.000000e-01 +// PerLayer-DAG: %[[b:.*]] = "tf.Const"() <{value = dense<[-2.000000e+00, 3.000000e+00 +// PerLayer-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> +// PerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// PerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %cst, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0}> +// PerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> () +// PerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> () // PerLayer-DAG: return %[[output0]], %[[output1_quantized]] } @@ -69,28 +69,28 @@ module { } // WholeModel-LABEL: func @multiple_conv2d -// WholeModel-DAG: %[[b0:.*]] = "tf.Const"() {value = dense<0.000000e+00> -// WholeModel-DAG: %[[b1:.*]] = "tf.Const"() {value = dense<1.000000e+00> -// WholeModel-DAG: %[[w0:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}0.193340182, 0.285152316 -// WholeModel-DAG: %[[w1:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445 -// WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} -// WholeModel-DAG: "tf.DumpTensor"(%[[output0]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} -// WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%[[output0]], %[[w1]], %[[b1]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} -// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) {enabled = false, file_name = 
"unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} +// WholeModel-DAG: %[[b0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> +// WholeModel-DAG: %[[b1:.*]] = "tf.Const"() <{value = dense<1.000000e+00> +// WholeModel-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}0.193340182, 0.285152316 +// WholeModel-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445 +// WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: "tf.DumpTensor"(%[[output0]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> +// WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%[[output0]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> // WholeModel-DAG: return %[[output1]] // PerLayer-LABEL: func @multiple_conv2d -// PerLayer-DAG: %[[b0:.*]] = "tf.Const"() {value = dense<0.000000e+00> -// PerLayer-DAG: %[[b1:.*]] = "tf.Const"() {value = dense<1.000000e+00> -// PerLayer-DAG: %[[w0:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}0.193340182, 0.285152316 -// PerLayer-DAG: %[[w1:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445 -// PerLayer-DAG: %[[output0_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} -// PerLayer-DAG: %[[output0_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_0} -// PerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) {enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} -// PerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) {enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} -// PerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} -// PerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0} -// PerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) {enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} -// PerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) {enabled 
= false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} +// PerLayer-DAG: %[[b0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> +// PerLayer-DAG: %[[b1:.*]] = "tf.Const"() <{value = dense<1.000000e+00> +// PerLayer-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}0.193340182, 0.285152316 +// PerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445 +// PerLayer-DAG: %[[output0_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// PerLayer-DAG: %[[output0_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_0}> +// PerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> +// PerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> +// PerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// PerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0}> +// PerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> +// PerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> // PerLayer-DAG: return %[[output1_quantized]] } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/cast_bf16_ops_to_f32.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/cast_bf16_ops_to_f32.mlir index 4fc6cbf3f97b7d..deaafb3e2e99a9 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/cast_bf16_ops_to_f32.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/cast_bf16_ops_to_f32.mlir @@ -10,8 +10,8 @@ func.func @cast_bf16_conv_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2 } // CHECK: func @cast_bf16_conv_to_fp32 -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> -// CHECK: %[[cast:.*]] = "tf.Cast"(%[[cst]]) {Truncate = false} : (tensor<2x3x3x2xbf16>) -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>}> {device = ""} : () -> tensor<2x3x3x2xbf16> +// CHECK: %[[cast:.*]] = "tf.Cast"(%[[cst]]) <{Truncate = false}> : (tensor<2x3x3x2xbf16>) -> tensor<2x3x3x2xf32> // CHECK: %[[conv:.*]] = "tf.Conv2D"(%arg0, 
%[[cast]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[conv]]) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> // CHECK: return %[[identity]] : tensor<1x3x2x2xf32> @@ -28,8 +28,8 @@ func.func @cast_bf16_conv_with_bias_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tens } // CHECK: func @cast_bf16_conv_with_bias_to_fp32 -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK: %[[conv:.*]] = "tf.Conv2D"(%arg0, %[[cst]]) // CHECK: %[[bias_add:.*]] = "tf.BiasAdd"(%[[conv]], %[[cst_0]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[bias_add]]) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> @@ -46,7 +46,7 @@ func.func @cast_bf16_avg_pool_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3 } // CHECK: func @cast_bf16_avg_pool_to_fp32 -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> // CHECK: %[[conv:.*]] = "tf.Conv2D"(%arg0, %[[cst]]) // CHECK: %[[avg_pool:.*]] = "tf.AvgPool"(%[[conv]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[avg_pool]]) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> @@ -62,7 +62,7 @@ func.func @cast_bf16_matmul_to_fp32(%arg0: tensor<1x10xf32>) -> (tensor<1x2xf32> } // CHECK: func @cast_bf16_matmul_to_fp32 -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<10x2xf32>} : () -> tensor<10x2xf32> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<10x2xf32>}> : () -> tensor<10x2xf32> // CHECK: %[[matmul:.*]] = "tf.MatMul"(%arg0, %[[cst]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[matmul]]) // CHECK: return %[[identity]] : tensor<1x2xf32> @@ -77,7 +77,7 @@ func.func @cast_bf16_depthwise_conv_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tens } // CHECK: func @cast_bf16_depthwise_conv_to_fp32 -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> // CHECK: %[[depthwise_conv:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[cst]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[depthwise_conv]]) {device = ""} : (tensor<1x2x2x6xf32>) -> tensor<1x2x2x6xf32> // CHECK: return %[[identity]] : tensor<1x2x2x6xf32> @@ -92,7 +92,7 @@ func.func @cast_bf16_batch_matmul_v2_to_fp32(%arg0: tensor<1x1x10xf32>) -> (tens } // CHECK: func @cast_bf16_batch_matmul_v2_to_fp32 -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<10x2xf32>} : () -> tensor<10x2xf32> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<10x2xf32>}> : () -> tensor<10x2xf32> // CHECK: %[[batch_matmul:.*]] = "tf.BatchMatMulV2"(%arg0, %[[cst]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[batch_matmul]]) {device = ""} : (tensor<1x1x2xf32>) -> tensor<1x1x2xf32> // CHECK: return %[[identity]] : tensor<1x1x2xf32> @@ -108,7 +108,7 @@ func.func @cast_bf16_add_v2_to_fp32(%arg0: tensor<2xbf16>, %arg1: tensor<2xbf16> // CHECK: func 
@cast_bf16_add_v2_to_fp32(%[[ARG_0:.*]]: tensor<2xbf16>, %[[ARG_1:.*]]: tensor<2xbf16>) -> tensor<2xf32> // bfloat16 operands are cast to f32 operands. -// CHECK-DAG: %[[CAST_0:.*]] = "tf.Cast"(%[[ARG_0]]) {Truncate = false} : (tensor<2xbf16>) -> tensor<2xf32> -// CHECK-DAG: %[[CAST_1:.*]] = "tf.Cast"(%[[ARG_1]]) {Truncate = false} : (tensor<2xbf16>) -> tensor<2xf32> +// CHECK-DAG: %[[CAST_0:.*]] = "tf.Cast"(%[[ARG_0]]) <{Truncate = false}> : (tensor<2xbf16>) -> tensor<2xf32> +// CHECK-DAG: %[[CAST_1:.*]] = "tf.Cast"(%[[ARG_1]]) <{Truncate = false}> : (tensor<2xbf16>) -> tensor<2xf32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[CAST_0]], %[[CAST_1]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> // CHECK: return %[[ADD]] : tensor<2xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tf_xla_op_to_tf_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tf_xla_op_to_tf_op.mlir index d30c61f7df72dd..27a7bb62bbbd74 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tf_xla_op_to_tf_op.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tf_xla_op_to_tf_op.mlir @@ -6,7 +6,7 @@ func.func @xla_dot_v2(%arg0: tensor, %arg1: tensor<3x4x5xf32>) -> (te } // CHECK: func @xla_dot_v2 -// CHECK: %[[einsum:.*]] = "tf.Einsum"(%arg0, %arg1) {equation = "abc,cde->abde"} : (tensor, tensor<3x4x5xf32>) -> tensor +// CHECK: %[[einsum:.*]] = "tf.Einsum"(%arg0, %arg1) <{equation = "abc,cde->abde"}> : (tensor, tensor<3x4x5xf32>) -> tensor // CHECK: return %[[einsum]] : tensor // ----- @@ -22,12 +22,12 @@ func.func @xla_gather(%arg0: tensor, %arg1: tensor<1xi32>, %arg2: tenso } // CHECK: func @xla_gather -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() {value = dense<1> : tensor<1x1xi64>} : () -> tensor<1x1xi64> -// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: %[[arg1_i64:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi64> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<1> : tensor<1x1xi64>}> : () -> tensor<1x1xi64> +// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[arg1_i64:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi64> // CHECK: %[[tensor_scatter_update:.*]] = "tf.TensorScatterUpdate"(%[[cst]], %[[cst_0]], %[[arg1_i64]]) : (tensor<2xi64>, tensor<1x1xi64>, tensor<1xi64>) -> tensor<2xi64> -// CHECK: %[[arg2_i64:.*]] = "tf.Cast"(%arg2) {Truncate = false} : (tensor<2xi32>) -> tensor<2xi64> +// CHECK: %[[arg2_i64:.*]] = "tf.Cast"(%arg2) <{Truncate = false}> : (tensor<2xi32>) -> tensor<2xi64> // CHECK: %[[slice:.*]] = "tf.Slice"(%arg0, %[[tensor_scatter_update]], %[[arg2_i64]]) : (tensor, tensor<2xi64>, tensor<2xi64>) -> tensor<*xf32> // CHECK: %[[reshape:.*]] = "tf.Reshape"(%[[slice]], %[[cst_1]]) : (tensor<*xf32>, tensor<1xi64>) -> tensor<*xf32> // CHECK: return %[[reshape]] : tensor<*xf32> @@ -47,12 +47,12 @@ func.func @xla_gather_known_output_shape(%arg0: tensor<5xi32>, %arg1: tensor<1xi } // CHECK: func @xla_gather_known_output_shape -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() {value = dense<0> : tensor<1x1xi64>} : () -> tensor<1x1xi64> -// 
CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() {value = dense<> : tensor<0xi64>} : () -> tensor<0xi64> -// CHECK: %[[arg1_i64:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi64> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<0> : tensor<1x1xi64>}> : () -> tensor<1x1xi64> +// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi64>}> : () -> tensor<0xi64> +// CHECK: %[[arg1_i64:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi64> // CHECK: %[[tensor_scatter_update:.*]] = "tf.TensorScatterUpdate"(%[[cst]], %[[cst_0]], %[[arg1_i64]]) : (tensor<1xi64>, tensor<1x1xi64>, tensor<1xi64>) -> tensor<1xi64> -// CHECK: %[[arg2_i64:.*]] = "tf.Cast"(%arg2) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi64> +// CHECK: %[[arg2_i64:.*]] = "tf.Cast"(%arg2) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi64> // CHECK: %[[slice:.*]] = "tf.Slice"(%arg0, %[[tensor_scatter_update]], %[[arg2_i64]]) : (tensor<5xi32>, tensor<1xi64>, tensor<1xi64>) -> tensor<1xi32> // CHECK: %[[reshape:.*]] = "tf.Reshape"(%[[slice]], %[[cst_1]]) : (tensor<1xi32>, tensor<0xi64>) -> tensor // CHECK: return %[[reshape]] : tensor diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tpu_model_to_cpu.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tpu_model_to_cpu.mlir index 26809cd7e8d94f..ad13ebbfd59814 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tpu_model_to_cpu.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tpu_model_to_cpu.mlir @@ -26,8 +26,8 @@ func.func private @tpu_func_0_optim0(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x2 } // CHECK: func @tpu_conv(%[[ARG0:.*]]: tensor<1x3x4x3xf32>) -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> -// CHECK: %[[cast:.*]] = "tf.Cast"(%[[cst]]) {Truncate = false} : (tensor<2x3x3x2xbf16>) -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>}> {device = ""} : () -> tensor<2x3x3x2xbf16> +// CHECK: %[[cast:.*]] = "tf.Cast"(%[[cst]]) <{Truncate = false}> : (tensor<2x3x3x2xbf16>) -> tensor<2x3x3x2xf32> // CHECK: %[[conv:.*]] = "tf.Conv2D"(%[[ARG0]], %[[cast]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[conv]]) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> // CHECK: return %[[identity]] : tensor<1x3x2x2xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/fake_quant_e2e_flow.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/fake_quant_e2e_flow.mlir index b08c49f94f11ba..aaddb72fccd63d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/fake_quant_e2e_flow.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/fake_quant_e2e_flow.mlir @@ -13,18 +13,18 @@ func.func @fake_quant_conv(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32 // CHECK-LABEL: @fake_quant_conv // CHECK-SAME: %[[ARG0:.*]]: tensor<1x3 // CHECK-SAME: %[[ARG1:.*]]: tensor<2x3 -// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<0.00117647066> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<-43> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {value = dense<0.0117647061> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() {value = 
dense<1.38408304E-5> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_3:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_4:.*]] = "tf.Const"() {value = dense<0.0027450982> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_5:.*]] = "tf.Const"() {value = dense<-19> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_6:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-NEXT: %[[V0:.*]] = "tf.PartitionedCall"(%[[ARG1]], %[[CST_1]], %[[CST_0]]) {config = "", config_proto = "", executor_type = "", f = @quantize_i8} : (tensor<2x3x3x2xf32>, tensor, tensor) -> tensor<2x3x3x2xi8> -// CHECK-NEXT: %[[V1:.*]] = "tf.PartitionedCall"(%[[ARG0]], %[[CST]], %[[CST_0]]) {config = "", config_proto = "", executor_type = "", f = @quantize_i8} : (tensor<1x3x4x3xf32>, tensor, tensor) -> tensor<1x3x4x3xi8> -// CHECK-NEXT: %[[V2:.*]] = "tf.PartitionedCall"(%[[V1]], %[[V0]], %[[CST_6]], %[[CST]], %[[CST_0]], %[[CST_1]], %[[CST_0]], %[[CST_2]], %[[CST_3]], %[[CST_4]], %[[CST_5]]) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_fn_0} : (tensor<1x3x4x3xi8>, tensor<2x3x3x2xi8>, tensor<2xi32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor<*xi8> -// CHECK-NEXT: %[[V3:.*]] = "tf.PartitionedCall"(%[[V2]], %[[CST_4]], %[[CST_5]]) {config = "", config_proto = "", executor_type = "", f = @dequantize_i8} : (tensor<*xi8>, tensor, tensor) -> tensor<*xf32> +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<0.00117647066> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<-43> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{value = dense<0.0117647061> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() <{value = dense<1.38408304E-5> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST_3:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST_4:.*]] = "tf.Const"() <{value = dense<0.0027450982> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST_5:.*]] = "tf.Const"() <{value = dense<-19> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST_6:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-NEXT: %[[V0:.*]] = "tf.PartitionedCall"(%[[ARG1]], %[[CST_1]], %[[CST_0]]) <{config = "", config_proto = "", executor_type = "", f = @quantize_i8}> : (tensor<2x3x3x2xf32>, tensor, tensor) -> tensor<2x3x3x2xi8> +// CHECK-NEXT: %[[V1:.*]] = "tf.PartitionedCall"(%[[ARG0]], %[[CST]], %[[CST_0]]) <{config = "", config_proto = "", executor_type = "", f = @quantize_i8}> : (tensor<1x3x4x3xf32>, tensor, tensor) -> tensor<1x3x4x3xi8> +// CHECK-NEXT: %[[V2:.*]] = "tf.PartitionedCall"(%[[V1]], %[[V0]], %[[CST_6]], %[[CST]], %[[CST_0]], %[[CST_1]], %[[CST_0]], %[[CST_2]], %[[CST_3]], %[[CST_4]], %[[CST_5]]) <{config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_fn_0}> : (tensor<1x3x4x3xi8>, tensor<2x3x3x2xi8>, tensor<2xi32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor<*xi8> +// CHECK-NEXT: %[[V3:.*]] = "tf.PartitionedCall"(%[[V2]], %[[CST_4]], %[[CST_5]]) <{config = "", config_proto = "", executor_type = "", f = @dequantize_i8}> : (tensor<*xi8>, tensor, tensor) -> tensor<*xf32> // CHECK-NEXT: return %[[V3]] : tensor<*xf32> // CHECK: func private @quantize_i8( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/fake_quant_e2e_xla.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/fake_quant_e2e_xla.mlir index 
5d9801e208593c..e5c5d8a4211d61 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/fake_quant_e2e_xla.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/fake_quant_e2e_xla.mlir @@ -25,7 +25,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: %[[pad:.*]] = "tf.PadV2"(%[[quant]] // CHECK: %[[xlaconv:.*]] = "tf.XlaConvV2"(%[[pad]] // CHECK: %[[sub:.*]] = "tf.Sub"(%[[xlaconv]] -// CHECK: %[[cast:.*]] = "tf.Cast"(%[[sub]]) {Truncate = false} : (tensor<1x3x2x2xi32>) -> tensor<1x3x2x2xf32> +// CHECK: %[[cast:.*]] = "tf.Cast"(%[[sub]]) <{Truncate = false}> : (tensor<1x3x2x2xi32>) -> tensor<1x3x2x2xf32> // CHECK: %[[dequant1:.*]] = "tf.Mul"(%[[cast]] // CHECK: %[[relu:.*]] = "tf.Relu"(%[[dequant1]] // CHECK: %[[clamped:.*]] = "tf.Minimum"(%[[relu]] @@ -35,12 +35,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: %[[maximum2:.*]] = "tf.Maximum"(%[[add2]] // CHECK: %[[minimum2:.*]] = "tf.Minimum"(%[[maximum2]] // CHECK: %[[round2:.*]] = "tf.Round"(%[[minimum2]] -// CHECK: %[[quant2:.*]] = "tf.Cast"(%[[round2]]) {Truncate = false} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xi8> +// CHECK: %[[quant2:.*]] = "tf.Cast"(%[[round2]]) <{Truncate = false}> : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xi8> // CHECK: %[[pad2:.*]] = "tf.PadV2"(%[[quant2]] // CHECK: %[[xlaconv2:.*]] = "tf.XlaConvV2"(%[[pad2]] // CHECK: %[[sub2:.*]] = "tf.Sub"(%[[xlaconv2]] -// CHECK: %[[cast2:.*]] = "tf.Cast"(%[[sub2]]) {Truncate = false} : (tensor<1x3x2x2xi32>) -> tensor<1x3x2x2xf32> +// CHECK: %[[cast2:.*]] = "tf.Cast"(%[[sub2]]) <{Truncate = false}> : (tensor<1x3x2x2xi32>) -> tensor<1x3x2x2xf32> // CHECK: %[[rescale2:.*]] = "tf.Mul"(%[[cast2]] // CHECK: %[[rescale2_maxclamped:.*]] = "tf.Maximum"(%[[rescale2]] // CHECK: %[[rescale2_minclamped:.*]] = "tf.Minimum"(%[[rescale2_maxclamped]] diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir index dd1c5a41bdf4d1..fa747357169f55 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir @@ -26,10 +26,10 @@ module { // CalibrationOptions(calibration_method=CALIBRATION_METHOD_MIN_MAX) // MIN-MAX-CHECK: func @add_custom_ops -// MIN-MAX-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) {calibration_method = 1 : i32, id = "", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// MIN-MAX-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// MIN-MAX-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// MIN-MAX-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> 
tensor<*xf32> -// MIN-MAX-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) {calibration_method = 1 : i32, id = "", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// MIN-MAX-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> // MIN-MAX-CHECK: func @no_custom_ops_on_non_f32_type @@ -44,10 +44,10 @@ module { // CalibrationOptions(calibration_method=CALIBRATION_METHOD_AVERAGE_MIN_MAX) // AVERAGE-MIN-MAX-CHECK: func @add_custom_ops -// AVERAGE-MIN-MAX-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) {calibration_method = 2 : i32, id = "", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// AVERAGE-MIN-MAX-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) {calibration_method = 2 : i32, id = "", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// AVERAGE-MIN-MAX-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// AVERAGE-MIN-MAX-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // AVERAGE-MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> -// AVERAGE-MIN-MAX-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) {calibration_method = 2 : i32, id = "", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// AVERAGE-MIN-MAX-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // AVERAGE-MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> // AVERAGE-MIN-MAX-CHECK: func @no_custom_ops_on_non_f32_type @@ -65,10 +65,10 @@ module { // calibration_parameters=CalibrationParameters(initial_num_bins=256, min_percentile=0.001, max_percentile=99.999) // ) // HISTOGRAM-PERCENTILE-CHECK: func @add_custom_ops -// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) {calibration_method = 3 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) {calibration_method = 3 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// 
HISTOGRAM-PERCENTILE-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-PERCENTILE-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) {calibration_method = 3 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-PERCENTILE-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-PERCENTILE-CHECK: func @no_custom_ops_on_non_f32_type @@ -86,10 +86,10 @@ module { // calibration_parameters=CalibrationParameters(initial_num_bins=256) // ) // HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @add_custom_ops -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) {calibration_method = 4 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) {calibration_method = 4 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) {calibration_method = 4 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @no_custom_ops_on_non_f32_type @@ -107,10 +107,10 @@ module { // calibration_parameters=CalibrationParameters(initial_num_bins=256) // ) // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @add_custom_ops -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) {calibration_method = 5 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: 
[[lhs:%.*]] = "tf.CustomAggregator"(%arg0) {calibration_method = 5 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) {calibration_method = 5 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @no_custom_ops_on_non_f32_type @@ -128,10 +128,10 @@ module { // calibration_parameters=CalibrationParameters(initial_num_bins=256) // ) // HISTOGRAM-MSE-SYMMETRIC-CHECK: func @add_custom_ops -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) {calibration_method = 6 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) {calibration_method = 6 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) {calibration_method = 6 : i32, id = "", initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: return [[res]] : tensor<*xf32> // 
HISTOGRAM-MSE-SYMMETRIC-CHECK: func @no_custom_ops_on_non_f32_type diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_main_function.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_main_function.mlir index 56208364ad8b3d..be98dd179bae11 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_main_function.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_main_function.mlir @@ -25,7 +25,7 @@ module attributes {tf.versions = {producer = 930 : i32}, tf_saved_model.semantic func.return %1 : tensor<1xf32> } // CHECK: func private @mul2(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> attributes {tf.entry_function = {inputs = "mul2_y:0,mul2_x:0", outputs = "PartitionedCall_1:0"}} { -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor}> : () -> tensor // CHECK: %[[MUL_1:.*]] = "tf.Mul"(%arg1, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> // CHECK: %[[MUL_2:.*]] = "tf.Mul"(%[[MUL_1]], %[[CONST_0]]) : (tensor<1xf32>, tensor) -> tensor<1xf32> // CHECK: return %[[MUL_2]] : tensor<1xf32> @@ -33,8 +33,8 @@ module attributes {tf.versions = {producer = 930 : i32}, tf_saved_model.semantic // CHECK: func @main(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["mul1_y:0"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["mul1_x:0"]}, %arg2: tensor<1xf32> {tf_saved_model.index_path = ["mul2_y:0"]}, %arg3: tensor<1xf32> {tf_saved_model.index_path = ["mul2_x:0"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["PartitionedCall:0"]}, tensor<1xf32> {tf_saved_model.index_path = ["PartitionedCall_1:0"]}) attributes {tf.entry_function = {inputs = "mul1_y:0,mul1_x:0,mul2_y:0,mul2_x:0", outputs = "PartitionedCall:0,PartitionedCall_1:0"}, tf_saved_model.exported_names = ["main"]} { // CHECK-NOT: f = @NoOp -// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) {config = "", config_proto = "", executor_type = "", f = @mul1} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> -// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg2, %arg3) {config = "", config_proto = "", executor_type = "", f = @mul2} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) <{config = "", config_proto = "", executor_type = "", f = @mul1}> : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg2, %arg3) <{config = "", config_proto = "", executor_type = "", f = @mul2}> : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> // CHECK-DAG: %[[IDENTITY_0:.*]] = "tf.Identity"(%[[PARTITIONEDCALL_0]]) // CHECK-DAG: %[[IDENTITY_1:.*]] = "tf.Identity"(%[[PARTITIONEDCALL_1]]) // CHECK: return %[[IDENTITY_0]], %[[IDENTITY_1]] : tensor<1xf32>, tensor<1xf32> @@ -82,10 +82,10 @@ module attributes {tf.versions = {producer = 1132 : i32}, tf_saved_model.semanti // CHECK-SAME: tf_saved_model.exported_names = ["main"] // Check that the function call to @add exists and not to @NoOp. 
-// CHECK: %[[CALL0:.*]] = "tf.PartitionedCall"(%[[ARG0]], %[[ARG1]]) { +// CHECK: %[[CALL0:.*]] = "tf.PartitionedCall"(%[[ARG0]], %[[ARG1]]) <{ // CHECK-NOT: f = @NoOp // CHECK-SAME: f = @add -// CHECK-SAME: } +// CHECK-SAME: }> // CHECK-SAME: : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[CALL0]]) // CHECK: return %[[IDENTITY]] : tensor<1xf32> @@ -111,7 +111,7 @@ module attributes {tf.versions = {producer = 930 : i32}, tf_saved_model.semantic // CHECK: func.func @main(%arg0: tensor<16xf32> {tf_saved_model.index_path = ["input:0"]}, %arg1: tensor {tf_saved_model.index_path = ["k:0"]}) // CHECK-SAME: -> (tensor {tf_saved_model.index_path = ["TopK:0"]}, tensor {tf_saved_model.index_path = ["TopK:1"]}) // CHECK-SAME: attributes {tf.entry_function = {inputs = "input:0,k:0", outputs = "TopK:0,TopK:1"}, tf_saved_model.exported_names = ["main"]} -// CHECK: %[[CALL0:.*]]:2 = "tf.PartitionedCall"(%arg0, %arg1) {config = "", config_proto = "", executor_type = "", f = @topk} +// CHECK: %[[CALL0:.*]]:2 = "tf.PartitionedCall"(%arg0, %arg1) <{config = "", config_proto = "", executor_type = "", f = @topk}> // Expects an IdentityN op to be created. // CHECK: %[[IDENTITY:.*]]:2 = "tf.IdentityN"(%[[CALL0]]#0, %[[CALL0]]#1) : (tensor, tensor) -> (tensor, tensor) // CHECK: return %[[IDENTITY]]#0, %[[IDENTITY]]#1 : tensor, tensor diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_restore_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_restore_op.mlir index 385052ab9e5cb5..7f73eee6f056ca 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_restore_op.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_restore_op.mlir @@ -32,10 +32,10 @@ module attributes {tf_saved_model.semantics} { // Test that RestoreV2 op is created with 1 resulting value. // CHECK: %[[RESTORE:.*]] = "tf.RestoreV2"(%[[ARG_0]], %[[CST_1]], %[[CST_2]]) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<2xf32> -// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[RESTORE]]) {validate_shape = false} : (tensor>>, tensor<2xf32>) -> () +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[RESTORE]]) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () // Test that the loc is properly set to it's shared_name. -// CHECK-LOC: "tf.VarHandleOp"() {{{.*shared_name = "var_0".*}}} +// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> // CHECK-LOC-SAME: loc("var_0") } @@ -66,19 +66,19 @@ module attributes {tf_saved_model.semantics} { // CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "var_0".*}} : () -> tensor>> // CHECK-DAG: %[[VAR_HANDLE_1:.*]] = "tf.VarHandleOp"() {{.*shared_name = "var_1".*}} : () -> tensor>> -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {{{.*value = dense<\["var_0", "var_1"\]> : tensor<2x!tf_type.string>.*}}} -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {{{.*value = dense<""> : tensor<2x!tf_type.string>.*}}} +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{{{.*value = dense<\["var_0", "var_1"\]> : tensor<2x!tf_type.string>.*}}}> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{{{.*value = dense<""> : tensor<2x!tf_type.string>.*}}}> // Test that RestoreV2 op is created with 2 resulting values. 
// CHECK: %[[RESTORE:.*]]:2 = "tf.RestoreV2"(%[[ARG_0]], %[[CST_0]], %[[CST_1]]) : (tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>) -> (tensor<2xf32>, tensor<4xi32>) -// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[RESTORE]]#0) {validate_shape = false} : (tensor>>, tensor<2xf32>) -> () -// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[RESTORE]]#1) {validate_shape = false} : (tensor>>, tensor<4xi32>) -> () +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[RESTORE]]#0) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[RESTORE]]#1) <{validate_shape = false}> : (tensor>>, tensor<4xi32>) -> () // Test that the locs are properly set to their shared_names. -// CHECK-LOC: "tf.VarHandleOp"() {{{.*shared_name = "var_0".*}}} +// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> // CHECK-LOC-SAME: loc("var_0") -// CHECK-LOC: "tf.VarHandleOp"() {{{.*shared_name = "var_1".*}}} +// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_1".*}}}> // CHECK-LOC-SAME: loc("var_1") } @@ -101,11 +101,11 @@ module attributes {tf_saved_model.semantics} { // Check that no function argument is created. // CHECK: func.func @init_func_init_op() -// CHECK-DAG: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() {{{.*shared_name = "var_0".*}}} : () -> tensor>> -// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{{.*value = dense<1.000000e\+00> : tensor<2xf32>.*}}} +// CHECK-DAG: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> : () -> tensor>> +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{{{.*value = dense<1.000000e\+00> : tensor<2xf32>.*}}}> // Make sure that "tf.RestoreV2" is not created. // CHECK-NOT: "tf.RestoreV2" -// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[CST]]) {validate_shape = false} : (tensor>>, tensor<2xf32>) -> () +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[CST]]) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () // CHECK-LOC: @init_func_init_op // CHECK-LOC: return @@ -140,19 +140,19 @@ module attributes {tf_saved_model.semantics} { // CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "var_0".*}} : () -> tensor>> // CHECK-DAG: %[[VAR_HANDLE_1:.*]] = "tf.VarHandleOp"() {{.*shared_name = "var_1".*}} : () -> tensor>> -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {{{.*value = dense<\["var_0", "var_1"\]> : tensor<2x!tf_type.string>.*}}} -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {{{.*value = dense<""> : tensor<2x!tf_type.string>.*}}} +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{{{.*value = dense<\["var_0", "var_1"\]> : tensor<2x!tf_type.string>.*}}}> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{{{.*value = dense<""> : tensor<2x!tf_type.string>.*}}}> // Test that RestoreV2 op is created with 2 resulting values. // CHECK: %[[RESTORE:.*]]:2 = "tf.RestoreV2"(%[[ARG_0]], %[[CST_0]], %[[CST_1]]) : (tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>) -> (tensor<2xf32>, tensor<2xf32>) -// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[RESTORE]]#0) {validate_shape = false} : (tensor>>, tensor<2xf32>) -> () -// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[RESTORE]]#1) {validate_shape = false} : (tensor>>, tensor<2xf32>) -> () +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[RESTORE]]#0) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[RESTORE]]#1) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () // Test that the locs are properly set to their shared_names. 
-// CHECK-LOC: "tf.VarHandleOp"() {{{.*shared_name = "var_0".*}}} +// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> // CHECK-LOC-SAME: loc("var_0") -// CHECK-LOC: "tf.VarHandleOp"() {{{.*shared_name = "var_1".*}}} +// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_1".*}}}> // CHECK-LOC-SAME: loc("var_1") } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_save_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_save_op.mlir index 9483331b6fe31d..c142247cdbc235 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_save_op.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_save_op.mlir @@ -30,8 +30,8 @@ module attributes {tf_saved_model.semantics} { // CHECK: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() // CHECK-SAME: {{.*shared_name = "var_0".*}} // CHECK: %[[READ_VARIABLE:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE]]) : (tensor>>) -> tensor<2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {{{.*value = dense<"var_0"> : tensor<1x!tf_type.string>.*}}} -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {{{.*value = dense<""> : tensor<1x!tf_type.string>.*}}} +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{{{.*value = dense<"var_0"> : tensor<1x!tf_type.string>.*}}}> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{{{.*value = dense<""> : tensor<1x!tf_type.string>.*}}}> // CHECK: "tf.SaveV2"(%[[ARG]], %[[CONST_0]], %[[CONST_1]], %[[READ_VARIABLE]]) // CHECK: return } @@ -73,8 +73,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-DAG: %[[READ_VARIABLE_0:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_0]]) : (tensor>>) -> tensor<2xf32> // CHECK-DAG: %[[READ_VARIABLE_1:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_1]]) : (tensor>>) -> tensor<3xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {{{.*value = dense<\["var_0", "var_1"\]> : tensor<2x!tf_type.string>.*}}} -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {{{.*value = dense<""> : tensor<2x!tf_type.string>.*}}} +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{{{.*value = dense<\["var_0", "var_1"\]> : tensor<2x!tf_type.string>.*}}}> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{{{.*value = dense<""> : tensor<2x!tf_type.string>.*}}}> // CHECK: "tf.SaveV2"(%[[ARG]], %[[CONST_0]], %[[CONST_1]], %[[READ_VARIABLE_0]], %[[READ_VARIABLE_1]]) // CHECK: return } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir index 03fcbf4a2b0623..4aa1ae76b8a83d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir @@ -10,8 +10,8 @@ func.func @issue_ids(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32 // CHECK: func @issue_ids -// CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) {id = "0"} : (tensor<*xf32>) -> tensor<*xf32> -// CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) {id = "1"} : (tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = "0"}> : (tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = "1"}> : (tensor<*xf32>) -> tensor<*xf32> // CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> -// CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) {id = "2"} : (tensor<*xf32>) -> 
tensor<*xf32> +// CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = "2"}> : (tensor<*xf32>) -> tensor<*xf32> // CHECK-NEXT: return [[res]] : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir index f61a9fbe9feb6f..6a7f9da6bc5563 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir @@ -26,10 +26,10 @@ func.func @float_conv(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> %7 = "tf.BiasAdd"(%6, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> func.return %2, %5, %7 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1} +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) // CHECK-SAME: f = @composite_conv2d_with_bias_and_relu_fn_1} // CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) @@ -39,8 +39,8 @@ func.func @float_conv(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> // CHECK-LABEL: private @composite_conv2d_with_bias_and_relu6_fn_1 // CHECK-NEXT: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true // CHECK-SAME: attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations" -// CHECK-SAME: data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true // CHECK-NEXT: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[CONV2D_0]], %arg2) // CHECK-NEXT: %[[RELU6_0:.*]] = "tf.Relu6"(%[[BIASADD_0]]) // CHECK-NEXT: return %[[RELU6_0]] @@ -70,15 +70,15 @@ func.func @float_conv_strides_equals_to_dilations(%arg0: tensor<1x3x4x3xf32>, %a } // CHECK-LABEL: func @float_conv_strides_equals_to_dilations(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<*xf32> { -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<*xf32> +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, 
tensor<2xf32>) -> tensor<*xf32> // CHECK: return %[[PARTITIONEDCALL_0]] : tensor<*xf32> // CHECK: } // CHECK-LABEL: func private @composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { // CHECK-NEXT: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true // CHECK-SAME: attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations" -// CHECK-SAME: data_format = "NHWC", device = "", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true // CHECK-NEXT: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[CONV2D_0]], %arg2) // CHECK-NEXT: %[[RELU6_0:.*]] = "tf.Relu6"(%[[BIASADD_0]]) // CHECK-NEXT: return %[[RELU6_0]] @@ -111,14 +111,14 @@ func.func @float_depthwise_conv(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x %7 = "tf.BiasAdd"(%6, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> func.return %2, %5, %7 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) -// CHECK-SAME: _tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_and_relu6_fn_1} +// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_and_relu6_fn_1}> +// CHECK-SAME: _tfl_quant_trait = "fully_quantizable" // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) -// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_and_relu_fn_1} +// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_and_relu_fn_1 // CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) -// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_fn_1} +// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_fn_1 // CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] // CHECK: } @@ -161,14 +161,14 @@ func.func @float_matmul( %7 = "tf.BiasAdd"(%6, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<10xf32>) -> tensor<*xf32> func.return %2, %5, %7 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<10xf32>} +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<10xf32>}> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_matmul_with_bias_and_relu6_fn_1} +// CHECK-SAME: f = @composite_matmul_with_bias_and_relu6_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) -// CHECK-SAME: f = @composite_matmul_with_bias_and_relu_fn_1} +// CHECK-SAME: f = @composite_matmul_with_bias_and_relu_fn_1 // CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) -// CHECK-SAME: f = @composite_matmul_with_bias_fn_1} +// CHECK-SAME: f = @composite_matmul_with_bias_fn_1 // CHECK: return %[[PARTITIONEDCALL_0]], 
%[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] // CHECK: } @@ -207,10 +207,10 @@ func.func @float_matmul_with_reshape(%arg0: tensor<1x10xf32>, %arg1: tensor<10x1 func.return %3 : tensor<*xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<10xf32>} -// CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[-1, 10]> : tensor<2xi32>} +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<10xf32>}> +// CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[-1, 10]> : tensor<2xi32>}> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]], %[[SHAPE]]) -// CHECK-SAME: f = @composite_matmul_with_reshape_and_bias_fn_1} +// CHECK-SAME: f = @composite_matmul_with_reshape_and_bias_fn_1 // CHECK: return %[[PARTITIONEDCALL_0]] // CHECK: } @@ -247,14 +247,14 @@ func.func @float_conv_no_bias(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2x func.return %1, %4, %6 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_conv2d_with_relu6_fn_1} +// CHECK-SAME: f = @composite_conv2d_with_relu6_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1) -// CHECK-SAME: f = @composite_conv2d_with_relu_fn_1} +// CHECK-SAME: f = @composite_conv2d_with_relu_fn_1 // CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1) -// CHECK-SAME: f = @composite_conv2d_fn_1} +// CHECK-SAME: f = @composite_conv2d_fn_1 // CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] // CHECK: } @@ -288,12 +288,12 @@ func.func @float_depthwise_conv_no_bias(%arg0: tensor<1x3x4x3xf32>, %arg1: tenso func.return %1, %4, %6 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_depthwise_conv2d_with_relu6_fn_1} +// CHECK-SAME: f = @composite_depthwise_conv2d_with_relu6_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1) -// CHECK-SAME: f = @composite_depthwise_conv2d_with_relu_fn_1} +// CHECK-SAME: f = @composite_depthwise_conv2d_with_relu_fn_1 // CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1) -// CHECK-SAME: f = @composite_depthwise_conv2d_fn_1} +// CHECK-SAME: f = @composite_depthwise_conv2d_fn_1 // CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] // CHECK: } @@ -323,12 +323,12 @@ func.func @float_matmul_no_bias( func.return %1, %4, %6 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_matmul_with_relu6_fn_1} +// CHECK-SAME: f = @composite_matmul_with_relu6_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1) -// CHECK-SAME: f = @composite_matmul_with_relu_fn_1} +// CHECK-SAME: f = @composite_matmul_with_relu_fn_1 // CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1) -// CHECK-SAME: f = @composite_matmul_fn_1} +// CHECK-SAME: f = @composite_matmul_fn_1 // CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], 
%[[PARTITIONEDCALL_2]] // CHECK: } @@ -361,14 +361,14 @@ func.func @conv3d_no_bias(%arg0: tensor<1x3x4x3x3xf32>) -> (tensor<1x3x2x3x2xf32 // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<2x3x3x3x2xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_conv3d_with_relu_fn_1} +// CHECK-SAME: f = @composite_conv3d_with_relu_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) -// CHECK-SAME: f = @composite_conv3d_with_relu6_fn_1} +// CHECK-SAME: f = @composite_conv3d_with_relu6_fn_1 // CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) -// CHECK-SAME: f = @composite_conv3d_fn_1} +// CHECK-SAME: f = @composite_conv3d_fn_1 // CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] @@ -406,14 +406,14 @@ func.func @conv3d_with_bias(%arg0: tensor<1x3x4x3x3xf32>) -> (tensor<1x3x2x3x2xf // CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {{.*}} : () -> tensor<2xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]], %[[CST_1]]) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_conv3d_with_bias_and_relu_fn_1} +// CHECK-SAME: f = @composite_conv3d_with_bias_and_relu_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]], %[[CST_1]]) -// CHECK-SAME: f = @composite_conv3d_with_bias_and_relu6_fn_1} +// CHECK-SAME: f = @composite_conv3d_with_bias_and_relu6_fn_1 // CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]], %[[CST_1]]) -// CHECK-SAME: f = @composite_conv3d_with_bias_fn_1} +// CHECK-SAME: f = @composite_conv3d_with_bias_fn_1 // CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_drq.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_drq.mlir index 35215eca54068e..305c6345566ab5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_drq.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_drq.mlir @@ -12,10 +12,10 @@ func.func @lift_float_matmul(%arg0: tensor<1x12x12x512xf32>) -> (tensor<*xf32>, } : (tensor<1x12x12x512xf32>, tensor<1x12x12x512xf32>) -> tensor<*xf32> func.return %out_1, %out_2 : tensor<*xf32>, tensor<*xf32> -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<512x512xf32>} : () -> tensor<512x512xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<512x512xf32>}> : () -> tensor<512x512xf32> // CHECK: %[[PARTITIONEDCALL:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST]]) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_matmul_fn_1} +// CHECK-SAME: f = @composite_matmul_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[UNQUANTIZED_OUTPUT:.*]] = "tf.MatMul"(%arg0, %arg0) // CHECK: } @@ -45,23 +45,23 @@ func.func @lift_float_conv(%arg0: tensor<1x3x4x3xf32>) -> (tensor<*xf32>, tensor func.return %2, %4 : tensor<*xf32>, tensor<*xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[CONST_1:.*]] = 
"tf.Const"() {value = dense<3.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1]]) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_conv2d_fn_2} +// CHECK-SAME: f = @composite_conv2d_fn_2}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0]]) // CHECK: %[[RELU6_0:.*]] = "tf.Relu6"(%[[BIASADD_0]]) // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1]]) -// CHECK-SAME: f = @composite_conv2d_fn_1} +// CHECK-SAME: f = @composite_conv2d_fn_1 // CHECK: %[[BIASADD_1:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_1]], %[[CONST_0]]) // CHECK: return %[[RELU6_0]], %[[BIASADD_1]] // CHECK: } // CHECK-LABEL: private @composite_conv2d_fn_2 // CHECK-NEXT: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true // CHECK-SAME: attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations" -// CHECK-SAME: data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true // CHECK-NEXT: return %[[CONV2D_0]] // CHECK-LABEL: private @composite_conv2d_fn_1 @@ -90,7 +90,7 @@ func.func @not_lift_float_conv_with_non_constant_weights(%arg0: tensor<1x3x4x3xf func.return %2, %4 : tensor<*xf32>, tensor<*xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK-NOT: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) // CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) } @@ -115,15 +115,15 @@ func.func @lift_float_depthwise_conv(%arg0: tensor<1x3x4x3xf32>) -> (tensor<*xf3 %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> func.return %2, %4 : tensor<*xf32>, tensor<*xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<3.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1]]) -// CHECK-SAME: _tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_depthwise_conv2d_fn_2} +// CHECK-SAME: f = @composite_depthwise_conv2d_fn_2}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0]]) // CHECK: %[[RELU6_0:.*]] = "tf.Relu6"(%[[BIASADD_0]]) // CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1]]) -// CHECK-SAME: f = @composite_depthwise_conv2d_fn_1} +// CHECK-SAME: f = 
@composite_depthwise_conv2d_fn_1 // CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_1]], %[[CONST_0]]) // CHECK: return %[[RELU6_0]], %[[BIASADD_0]] // CHECK: } @@ -153,8 +153,8 @@ func.func @lift_float_conv3d(%arg0: tensor<1x3x4x3x3xf32>) -> (tensor<1x3x2x3x2x // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<2x3x3x3x2xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) -// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_conv3d_fn_1} +// CHECK-SAME: f = @composite_conv3d_fn_1}> +// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[RELU:.*]] = "tf.Relu"(%[[PARTITIONEDCALL_0]]) // CHECK: return %[[RELU]] @@ -162,8 +162,8 @@ func.func @lift_float_conv3d(%arg0: tensor<1x3x4x3x3xf32>) -> (tensor<1x3x2x3x2x // WEIGHTONLY-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<2x3x3x3x2xf32> // WEIGHTONLY: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) -// WEIGHTONLY: {_tfl_quant_trait = "fully_quantizable", -// WEIGHTONLY-SAME: f = @composite_conv3d_fn_1} +// WEIGHTONLY-SAME: f = @composite_conv3d_fn_1}> +// WEIGHTONLY: {_tfl_quant_trait = "fully_quantizable" // WEIGHTONLY: %[[RELU:.*]] = "tf.Relu"(%[[PARTITIONEDCALL_0]]) // WEIGHTONLY: return %[[RELU]] @@ -181,16 +181,16 @@ func.func @lift_float_batch_matmul(%arg0: tensor<4x4x3xf32>) -> (tensor<4x4x3xf3 // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<4x3x3xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) -// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_batch_matmul_fn_1} +// CHECK-SAME: f = @composite_batch_matmul_fn_1}> +// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable" // CHECK: return %[[PARTITIONEDCALL_0]] // CHECK-LABEL: private @composite_batch_matmul_fn_1 // WEIGHTONLY-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<4x3x3xf32> // WEIGHTONLY: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) -// WEIGHTONLY-SAME: {_tfl_quant_trait = "fully_quantizable", -// WEIGHTONLY-SAME: f = @composite_batch_matmul_fn_1} +// WEIGHTONLY-SAME: f = @composite_batch_matmul_fn_1}> +// WEIGHTONLY-SAME: {_tfl_quant_trait = "fully_quantizable" // WEIGHTONLY: return %[[PARTITIONEDCALL_0]] // WEIGHTONLY-LABEL: private @composite_batch_matmul_fn_1 @@ -209,15 +209,15 @@ func.func @lift_float_gather(%arg0: tensor<6xi64>) -> (tensor<6x32xf32>) { // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor // CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {{.*}} : () -> tensor<128x32xf32> // CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%[[CST_1]], %arg0, %[[CST]]) -// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: f = @composite_gather_fn_1} +// CHECK-SAME: f = @composite_gather_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" // CHECK: return %[[PARTITIONEDCALL_0]] // WEIGHTONLY-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor // WEIGHTONLY-DAG: %[[CST_1:.*]] = "tf.Const"() {{.*}} : () -> tensor<128x32xf32> // WEIGHTONLY: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%[[CST_1]], %arg0, %[[CST]]) -// WEIGHTONLY-SAME: {_tfl_quant_trait = "fully_quantizable", -// WEIGHTONLY-SAME: f = @composite_gather_fn_1} +// WEIGHTONLY-SAME: f = @composite_gather_fn_1}> +// WEIGHTONLY-SAME: {_tfl_quant_trait = "fully_quantizable" // WEIGHTONLY: return %[[PARTITIONEDCALL_0]] // WEIGHTONLY-LABEL: private @composite_gather_fn_1 diff --git 
a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_drq_min_elements.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_drq_min_elements.mlir index b1104490025c6e..83d6b618c44489 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_drq_min_elements.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_drq_min_elements.mlir @@ -11,11 +11,11 @@ func.func @lift_float_matmul(%arg0: tensor<1x12x12x512xf32>) -> (tensor<*xf32>, } : (tensor<1x12x12x512xf32>, tensor<1x12x12x512xf32>) -> tensor<*xf32> func.return %out_1, %out_2 : tensor<*xf32>, tensor<*xf32> -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<512x512xf32>} : () -> tensor<512x512xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<512x512xf32>}> : () -> tensor<512x512xf32> // CHECK: %[[PARTITIONEDCALL:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST]]) -// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: {config = "", -// CHECK-SAME: f = @composite_matmul_fn_1} +// CHECK-SAME: <{config = "", +// CHECK-SAME: f = @composite_matmul_fn_1}> +// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable" // CHECK: %[[UNQUANTIZED_OUTPUT:.*]] = "tf.MatMul"(%arg0, %arg0) // CHECK: } @@ -33,9 +33,9 @@ func.func @not_lift_float_conv(%arg0: tensor<1x3x4x512xf32>) -> (tensor<*xf32>) } : (tensor<1x3x4x512xf32>, tensor<2x3x512x512xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<3.000000e+00> : tensor<2x3x512x512xf32>} : () -> tensor<2x3x512x512xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor<2x3x512x512xf32>}> : () -> tensor<2x3x512x512xf32> // CHECK: %[[PARTITIONEDCALL:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST]]) -// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable", -// CHECK-SAME: {config = "", -// CHECK-SAME: f = @composite_conv2d_fn_1} +// CHECK-SAME: <{config = "", +// CHECK-SAME: f = @composite_conv2d_fn_1}> +// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable" } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla.mlir index 1d80a199d2e3ba..38911e2b70eb32 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla.mlir @@ -14,15 +14,15 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK-LABEL: func @depthwise_conv // CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_and_relu6_fn_1 // Check that the `_tfl_quant_trait` attribute has been removed. // CHECK-NOT: _tfl_quant_trait = "fully_quantizable" -// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_and_relu6_fn_1 // CHECK-LABEL: private @composite_depthwise_conv2d_with_bias_and_relu6_fn_1 // CHECK: %[[DEPTHWISECONV2D_0:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %arg1) +// CHECK-SAME: <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1]}> // Check that the `attr_map` attribute has been removed. 
// CHECK-NOT: attr_map
-// CHECK-SAME: {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1]}
// -----
@@ -35,17 +35,17 @@ func.func @conv_with_non_constant_filter(%arg0: tensor<1x3x4x3xf32>, %arg1: tens
}
// CHECK-LABEL: func @conv_with_non_constant_filter
-// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
+// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32>
// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]])
+// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1
// Check that the `_tfl_quant_trait` attribute has been removed.
// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
-// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1
// CHECK-LABEL: func private @composite_conv2d_with_bias_and_relu6_fn_1
// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1)
+// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]
// Check that the `attr_map` attribute has been removed.
// CHECK-NOT: attr_map
-// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]
// -----
@@ -59,18 +59,18 @@ func.func @conv_with_dynamic_channel_dim(%arg0: tensor<1x3x4x?xf32>) -> tensor<*
}
// CHECK-LABEL: func @conv_with_dynamic_channel_dim
-// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32>
+// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {{.*}} : () -> tensor<2x3x3x1xf32>
// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1]], %[[CONST_0]])
+// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1
// Check that the `_tfl_quant_trait` attribute has been removed.
// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
-// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1
// CHECK-LABEL: func private @composite_conv2d_with_bias_and_relu6_fn_1
// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1)
+// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]
// Check that the `attr_map` attribute has been removed.
// CHECK-NOT: attr_map
-// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]
// -----
@@ -93,11 +93,11 @@ func.func @const_filter_with_q_dq(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2
// CHECK-LABEL: func @const_filter_with_q_dq
// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() {{.*}} : () -> tensor<2x3x3x2xf32>
-// CHECK-DAG: %[[BIAS:.*]] = "tf.Const"() {device = "", value = dense<[1.000000e-01, 2.000000e-01]> : tensor<2xf32>}
+// CHECK-DAG: %[[BIAS:.*]] = "tf.Const"() <{value = dense<[1.000000e-01, 2.000000e-01]> : tensor<2xf32>}> {device = ""}
// CHECK: %[[Q_W:.*]] = "quantfork.qcast"(%[[WEIGHT]])
// CHECK: %[[DQ_W:.*]] = "quantfork.dcast"(%[[Q_W]])
// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"({{.*}}, %[[DQ_W]], %[[BIAS]])
-// CHECK-SAME: _tfl_quant_trait = "fully_quantizable"
// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu_fn_1
+// CHECK-SAME: _tfl_quant_trait = "fully_quantizable"
-// CHECK-LABEL: func private @composite_conv2d_with_bias_and_relu_fn_1
\ No newline at end of file
+// CHECK-LABEL: func private @composite_conv2d_with_bias_and_relu_fn_1
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla_selective_quantization.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla_selective_quantization.mlir
index 8dfd2815692fc6..a2a86a2c112664 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla_selective_quantization.mlir
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions_xla_selective_quantization.mlir
@@ -16,9 +16,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK-LABEL: func @conv2d_unmatching_unit
// CHECK: "tf.PartitionedCall"
+// CHECK-SAME: f = @composite_conv2d_fn_1
// Check that the `_tfl_quant_trait` attribute exists since the unit is not in `unit_wise_quantization_specs`.
// CHECK-SAME: _tfl_quant_trait = "fully_quantizable"
-// CHECK-SAME: f = @composite_conv2d_fn_1
// CHECK-SAME: loc(callsite("Model/conv2d@conv2d_unmatching_unit"("Conv2D") at "QuantizationUnit({{.*}})"))
// -----
@@ -36,9 +36,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK-LABEL: func @conv2d_disable_quantization
// CHECK: "tf.PartitionedCall"
+// CHECK-SAME: f = @composite_conv2d_fn_1
// Check that quantization is disabled for this unit.
// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
-// CHECK-SAME: f = @composite_conv2d_fn_1
// CHECK-SAME: loc(callsite("test_opt_out@conv2d_disable_quantization"("Conv2D") at "QuantizationUnit({{.*}})"))
// -----
@@ -58,9 +58,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK-LABEL: func @conv2d_with_bias_disable_quantization
// CHECK: "tf.PartitionedCall"
+// CHECK-SAME: f = @composite_conv2d_with_bias_fn_1
// Check that quantization is disabled for this unit.
// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
-// CHECK-SAME: f = @composite_conv2d_with_bias_fn_1
// CHECK-SAME: loc(callsite("test_opt_out@conv2d_with_bias_disable_quantization"("Conv2D") at "QuantizationUnit({{.*}})"))
// -----
@@ -80,9 +80,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK-LABEL: func @matmul_with_reshape_disable_quantization
// CHECK: "tf.PartitionedCall"
+// CHECK-SAME: f = @composite_matmul_with_reshape_and_bias_fn_1
// Check that quantization is disabled for this unit.
// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
-// CHECK-SAME: f = @composite_matmul_with_reshape_and_bias_fn_1
// CHECK-SAME: loc(callsite("test_opt_out@matmul_with_reshape_disable_quantization"("MatMul") at "QuantizationUnit({{.*}})"))
// -----
@@ -105,8 +105,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK-LABEL: func @serving_default
// CHECK: "tf.PartitionedCall"
+// CHECK-SAME: f = @composite_conv2d_fn_1
// Check that quantization is disabled for this unit.
// CHECK-NOT: _tfl_quant_trait = "fully_quantizable"
-// CHECK-SAME: f = @composite_conv2d_fn_1
// CHECK-SAME: loc(callsite("test_opt_out@conv2d_with_inliner"("Conv2D") at "QuantizationUnit({{.*}})"))
}
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_initializer_function_ops_to_main.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_initializer_function_ops_to_main.mlir
index 0ca4745b730aa3..1b0e3e286f1f8d 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_initializer_function_ops_to_main.mlir
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_initializer_function_ops_to_main.mlir
@@ -146,14 +146,14 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK-NEXT: %[[OUT:.*]], %[[CTL:.*]] = tf_executor.island wraps "tf.PartitionedCall"(%[[ARG]])
// CHECK-SAME: f = @serving_default
// Checks that the contents of @NoOp are copied here.
-// CHECK-DAG: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.Const"() {{{.*value = dense<"test_1">.*}}}
-// CHECK-DAG: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.Const"() {{{.*value = dense<1>.*}}}
+// CHECK-DAG: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"test_1">.*}}}>
+// CHECK-DAG: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<1>.*}}}>
// CHECK-NEXT: %[[OUT_2:.*]], %[[CTL_2:.*]] = tf_executor.island wraps "tf.HashTableV2"()
// CHECK-NEXT: %[[CTL_3:.*]] = tf_executor.island wraps "tf.LookupTableImportV2"(%[[OUT_2]], %[[OUT_0]], %[[OUT_1]])
-// CHECK-DAG: %[[OUT_3:.*]], %[[CTL_4:.*]] = tf_executor.island wraps "tf.Const"() {{{.*value = dense<"test_2">.*}}}
-// CHECK-DAG: %[[OUT_4:.*]], %[[CTL_5:.*]] = tf_executor.island wraps "tf.Const"() {{{.*value = dense<2>.*}}}
+// CHECK-DAG: %[[OUT_3:.*]], %[[CTL_4:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"test_2">.*}}}>
+// CHECK-DAG: %[[OUT_4:.*]], %[[CTL_5:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<2>.*}}}>
// CHECK-NEXT: %[[OUT_5:.*]], %[[CTL_6:.*]] = tf_executor.island(%[[CTL_3]]) wraps "tf.HashTableV2"()
// CHECK-NEXT: %[[CTL_7:.*]] = tf_executor.island wraps "tf.LookupTableImportV2"(%[[OUT_5]], %[[OUT_3]], %[[OUT_4]])
@@ -330,9 +330,9 @@ module attributes {tf_saved_model.semantics} {
// CHECK-NEXT: tf_executor.graph
// Checks that the ops from @init_func_restore_op are cloned.
-// CHECK-DAG: %[[CONST_0:.*]], %[[CTL:.*]] = tf_executor.island wraps "tf.Const"() {{{.*value = dense<""> : tensor<1x!tf_type\.string>.*}}} -// CHECK-DAG: %[[CONST_1:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.Const"() {{{.*value = dense<"var_0"> : tensor<1x!tf_type\.string>.*}}} -// CHECK: %[[VAR_HANDLE:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {{{.*shared_name = "var_0".*}}} +// CHECK-DAG: %[[CONST_0:.*]], %[[CTL:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<""> : tensor<1x!tf_type\.string>.*}}}> +// CHECK-DAG: %[[CONST_1:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"var_0"> : tensor<1x!tf_type\.string>.*}}}> +// CHECK: %[[VAR_HANDLE:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> // CHECK: %[[RESTORE:.*]], %[[CTL_2:.*]] = tf_executor.island wraps "tf.RestoreV2"(%[[ARG]], %[[CONST_1]], %[[CONST_0]]) // CHECK: %[[CTL_3:.*]] = tf_executor.island wraps "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[RESTORE]]) // CHECK: %[[CTL_4:.*]] = tf_executor.island(%[[CTL_3]]) wraps "tf.NoOp"() diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_save_function_ops_to_main.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_save_function_ops_to_main.mlir index 5341e15c798161..bc0b283b808831 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_save_function_ops_to_main.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_save_function_ops_to_main.mlir @@ -32,10 +32,10 @@ module attributes {tf_saved_model.semantics} { // CHECK: func.func @main // CHECK-SAME: %[[ARG_0:.*]]: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]} // CHECK: tf_executor.graph -// CHECK: %[[VAR_HANDLE:.*]], {{.*}} = tf_executor.island wraps "tf.VarHandleOp"() {{{.*shared_name = "var_0".*}}} +// CHECK: %[[VAR_HANDLE:.*]], {{.*}} = tf_executor.island wraps "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> // CHECK: %[[READ_VARIABLE:.*]], {{.*}} = tf_executor.island wraps "tf.ReadVariableOp"(%[[VAR_HANDLE]]) -// CHECK-DAG: %[[CST_0:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() {{{.*value = dense<"var_0"> : tensor<1x!tf_type\.string>.*}}} -// CHECK-DAG: %[[CST_1:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() {{{.*value = dense<""> : tensor<1x!tf_type\.string>.*}}} +// CHECK-DAG: %[[CST_0:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"var_0"> : tensor<1x!tf_type\.string>.*}}}> +// CHECK-DAG: %[[CST_1:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<""> : tensor<1x!tf_type\.string>.*}}}> // CHECK: %[[CTL_0:.*]] = tf_executor.island wraps "tf.SaveV2"(%[[ARG_0]], %[[CST_0]], %[[CST_1]], %[[READ_VARIABLE]]) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<2xf32>) -> () // Test that the Identity op has been created to fetch the file prefix @@ -150,10 +150,10 @@ module attributes {tf_saved_model.semantics} { // CHECK-SAME: %[[ARG_0:.*]]: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]} // CHECK-SAME: tf.entry_function = {inputs = "__tf_file_prefix:0", outputs = ""} // CHECK: tf_executor.graph -// CHECK: %[[VAR_HANDLE:.*]], {{.*}} = tf_executor.island wraps "tf.VarHandleOp"() {{{.*shared_name = "var_0".*}}} +// CHECK: %[[VAR_HANDLE:.*]], {{.*}} = tf_executor.island wraps "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> // CHECK: %[[READ_VARIABLE:.*]], {{.*}} = tf_executor.island wraps "tf.ReadVariableOp"(%[[VAR_HANDLE]]) -// 
CHECK-DAG: %[[CST_0:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() {{{.*value = dense<"var_0"> : tensor<1x!tf_type\.string>.*}}} -// CHECK-DAG: %[[CST_1:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() {{{.*value = dense<""> : tensor<1x!tf_type\.string>.*}}} +// CHECK-DAG: %[[CST_0:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"var_0"> : tensor<1x!tf_type\.string>.*}}}> +// CHECK-DAG: %[[CST_1:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<""> : tensor<1x!tf_type\.string>.*}}}> // CHECK: %[[CTL_0:.*]] = tf_executor.island wraps "tf.SaveV2"(%[[ARG_0]], %[[CST_0]], %[[CST_1]], %[[READ_VARIABLE]]) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<2xf32>) -> () // Test that the Identity op has been created to fetch the file prefix diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/optimize.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/optimize.mlir index 2a69627bb40c96..48a5c43a5bcaa2 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/optimize.mlir @@ -56,11 +56,11 @@ func.func @remove_redundant_cast(%arg0: tensor<1x100x100x1xf32>) -> (tensor<1x96 // CHECK: %[[CLIPBYVALUE_0:.*]] = "tf.ClipByValue" // CHECK-SAME: (tensor<1x100x100x1xi32>, tensor, tensor) -> tensor<1x100x100x1xi32> -// CHECK: %[[CAST_1:.*]] = "tf.Cast"(%[[CLIPBYVALUE_0]]) {Truncate = false} : (tensor<1x100x100x1xi32>) -> tensor<1x100x100x1xf32> +// CHECK: %[[CAST_1:.*]] = "tf.Cast"(%[[CLIPBYVALUE_0]]) <{Truncate = false}> : (tensor<1x100x100x1xi32>) -> tensor<1x100x100x1xf32> // CHECK: %[[CLIPBYVALUE_1:.*]] = "tf.ClipByValue" // CHECK-SAME: (tensor<1x98x98x1xi32>, tensor, tensor) -> tensor<1x98x98x1xi32> -// CHECK: %[[CAST_3:.*]] = "tf.Cast"(%[[CLIPBYVALUE_1]]) {Truncate = false} : (tensor<1x98x98x1xi32>) -> tensor<1x98x98x1xf32> +// CHECK: %[[CAST_3:.*]] = "tf.Cast"(%[[CLIPBYVALUE_1]]) <{Truncate = false}> : (tensor<1x98x98x1xi32>) -> tensor<1x98x98x1xf32> // CHECK: %[[CLIPBYVALUE_2:.*]] = "tf.ClipByValue" // CHECK-SAME: (tensor<1x96x96x1xi32>, tensor, tensor) -> tensor<1x96x96x1xi32> @@ -76,7 +76,7 @@ func.func @consecutive_add_add(%arg0: tensor) -> (tensor) { // CHECK-LABEL: func.func @consecutive_add_add -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<-30> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<-30> : tensor}> : () -> tensor // CHECK: %[[ADD:.*]] = "tf.AddV2"(%arg0, %[[CST]]) : (tensor, tensor) -> tensor // CHECK: return %[[ADD]] : tensor } @@ -90,7 +90,7 @@ func.func @consecutive_add_sub(%arg0: tensor) -> (tensor) { // CHECK-LABEL: func.func @consecutive_add_sub -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor // CHECK: %[[SUB:.*]] = "tf.Sub"(%arg0, %[[CST]]) : (tensor, tensor) -> tensor // CHECK: return %[[SUB]] : tensor } @@ -104,7 +104,7 @@ func.func @consecutive_sub_add(%arg0: tensor) -> (tensor) { // CHECK-LABEL: func.func @consecutive_sub_add -// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor // CHECK: %[[ADD:.*]] = "tf.AddV2"(%arg0, %[[CST]]) : (tensor, tensor) -> tensor // CHECK: return %[[ADD]] : tensor } @@ -118,7 +118,7 @@ func.func @consecutive_sub_sub(%arg0: tensor) -> (tensor) { // CHECK-LABEL: func.func @consecutive_sub_sub -// CHECK: 
%[[CST:.*]] = "tf.Const"() {value = dense<-30> : tensor} : () -> tensor +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<-30> : tensor}> : () -> tensor // CHECK: %[[SUB:.*]] = "tf.Sub"(%arg0, %[[CST]]) : (tensor, tensor) -> tensor // CHECK: return %[[SUB]] : tensor } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir index c99fed3d43a6f6..1e771e2586a61e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir @@ -7,8 +7,8 @@ func.func @decompose_batch_norm(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { func.return %add : tensor<*xf32> } // CHECK: func @decompose_batch_norm -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<2.49743462E-5> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.999950051> : tensor<2xf32>} : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.49743462E-5> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.999950051> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK: %[[mul:.*]] = "tf.Mul"(%arg0, %[[CONST_0]]) : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> // CHECK: %[[add:.*]] = "tf.AddV2"(%[[mul]], %[[CONST]]) : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> // CHECK-NEXT: return %[[add]] : tensor<*xf32> @@ -22,9 +22,9 @@ func.func @not_decompose_batch_norm(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { func.return %bn : tensor<*xf32> } // CHECK: func @not_decompose_batch_norm -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK: %[[bn:.*]], %batch_mean, %batch_variance, %reserve_space_1, %reserve_space_2, %reserve_space_3 = "tf.FusedBatchNormV3"(%arg0, %[[CONST]], %[[CONST_0]], %[[CONST_0]], %[[CONST]]) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = true} : (tensor<*xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[bn:.*]], %batch_mean, %batch_variance, %reserve_space_1, %reserve_space_2, %reserve_space_3 = "tf.FusedBatchNormV3"(%arg0, %[[CONST]], %[[CONST_0]], %[[CONST_0]], %[[CONST]]) <{data_format = "NHWC", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = true}> {device = ""} : (tensor<*xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) // CHECK-NEXT: return %[[bn]] : tensor<*xf32> // ----- @@ -37,10 +37,10 @@ func.func @convert_add_to_biasadd(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2 func.return %1 : tensor<1x3x2x2xf32> } // CHECK: func @convert_add_to_biasadd -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = 
dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> -// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[BIASADD]] : tensor<1x3x2x2xf32> // ----- @@ -53,9 +53,9 @@ func.func @not_convert_add_to_biasadd(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3 func.return %1 : tensor<1x3x2x3xf32> } // CHECK: func @not_convert_add_to_biasadd -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x3xf32>} : () -> tensor<2x3x3x3xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor<1x3x2x3xf32>} : () -> tensor<1x3x2x3xf32> -// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x3xf32>) -> tensor<1x3x2x3xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x3xf32>}> : () -> tensor<2x3x3x3xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor<1x3x2x3xf32>}> : () -> tensor<1x3x2x3xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x3xf32>) -> tensor<1x3x2x3xf32> // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[CONV2D]], %[[CONST_0]]) : (tensor<1x3x2x3xf32>, tensor<1x3x2x3xf32>) -> tensor<1x3x2x3xf32> // CHECK-NEXT: return %[[ADD]] : tensor<1x3x2x3xf32> @@ -69,8 +69,8 @@ func.func @fuse_conv2d_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf3 func.return %1 : tensor<1x3x2x2xf32> } // CHECK: func @fuse_conv2d_and_mul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<8.000000e-01> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : 
(tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[CONV2D]] : tensor<1x3x2x2xf32> // ----- @@ -83,9 +83,9 @@ func.func @not_fuse_conv2d_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x func.return %1 : tensor<1x3x2x2xf32> } // CHECK: func @not_fuse_conv2d_and_mul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<4.000000e-01> : tensor<2x2xf32>} : () -> tensor<2x2xf32> -// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2x2xf32>}> : () -> tensor<2x2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: %[[ADD:.*]] = "tf.Mul"(%[[CONV2D]], %[[CONST_0]]) : (tensor<1x3x2x2xf32>, tensor<2x2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[ADD]] : tensor<1x3x2x2xf32> @@ -101,10 +101,10 @@ func.func @fuse_conv2d_with_bias_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor< func.return %2 : tensor<1x3x2x2xf32> } // CHECK: func @fuse_conv2d_with_bias_and_mul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<2.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> -// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<2.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[BIASADD]] : tensor<1x3x2x2xf32> // ----- @@ -119,11 +119,11 @@ func.func @not_fuse_conv2d_with_bias_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (ten func.return %1, %2 : tensor<1x3x2x2xf32>, tensor<1x3x2x2xf32> } // CHECK: func @not_fuse_conv2d_with_bias_and_mul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<4.000000e-01> : 
tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<8.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> -// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: %[[MUL:.*]] = "tf.Mul"(%[[CONV2D]], %[[CONST_1]]) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[BIASADD]], %[[MUL]] : tensor<1x3x2x2xf32>, tensor<1x3x2x2xf32> @@ -139,10 +139,10 @@ func.func @fuse_conv2d_with_bias_and_add(%arg0: tensor<1x3x4x3xf32>) -> (tensor< func.return %2 : tensor<1x3x2x2xf32> } // CHECK: func @fuse_conv2d_with_bias_and_add -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> -// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[BIASADD]] : tensor<1x3x2x2xf32> // ----- @@ -156,10 +156,10 @@ func.func @not_fuse_conv2d_with_bias_and_add(%arg0: tensor<1x3x4x3xf32>, %arg1: func.return %2 : tensor<1x3x2x2xf32> } // CHECK: func @not_fuse_conv2d_with_bias_and_add -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() 
{value = dense<4.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> -// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[BIASADD]], %arg1) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[ADD]] : tensor<1x3x2x2xf32> @@ -173,10 +173,10 @@ func.func @match_depthwise_conv2d_and_add(%arg0: tensor<*xf32>) -> (tensor<*xf32 func.return %1 : tensor<*xf32> } // CHECK: func @match_depthwise_conv2d_and_add -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<4.000000e-01> : tensor<3xf32>} : () -> tensor<3xf32> -// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor -// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor<*xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<3xf32>}> : () -> tensor<3xf32> +// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> {device = ""} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor, tensor<3xf32>) -> tensor<*xf32> // CHECK-NEXT: return %[[BIASADD]] : tensor<*xf32> // ----- @@ -189,8 +189,8 @@ func.func @match_depthwise_conv2d_and_mul(%arg0: tensor<*xf32>) -> (tensor } // CHECK: func @match_depthwise_conv2d_and_mul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<8.000000e-01> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> -// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<2x3x3x1xf32>}> : () -> 
tensor<2x3x3x1xf32> +// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> {device = ""} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor // CHECK-NEXT: return %[[DEPTHWISE_CONV2D]] : tensor // ----- @@ -205,10 +205,10 @@ func.func @match_depthwise_conv2d_with_bias_and_add(%arg0: tensor<*xf32>) -> (te func.return %2 : tensor } // CHECK: func @match_depthwise_conv2d_with_bias_and_add -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<8.000000e-01> : tensor<3xf32>} : () -> tensor<3xf32> -// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor -// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<3xf32>}> : () -> tensor<3xf32> +// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> {device = ""} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor, tensor<3xf32>) -> tensor // CHECK-NEXT: return %[[BIASADD]] : tensor // ----- @@ -223,10 +223,10 @@ func.func @match_depthwise_conv2d_with_bias_and_mul(%arg0: tensor<*xf32>) -> (te func.return %2 : tensor } // CHECK: func @match_depthwise_conv2d_with_bias_and_mul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<2.000000e-01> : tensor<3xf32>} : () -> tensor<3xf32> -// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor -// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<2.000000e-01> : tensor<3xf32>}> : () -> tensor<3xf32> +// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> {device = ""} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor, tensor<3xf32>) -> tensor // CHECK-NEXT: return %[[BIASADD]] : tensor // ----- @@ -236,7 +236,7 @@ func.func @lower_einsum(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> t func.return %0 : 
tensor<3x4x6xf32> } // CHECK-LABEL: lower_einsum -// CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> +// CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> // ----- @@ -251,7 +251,7 @@ func.func @removing_identity_after_const(%arg0: tensor<*xf32>) -> (tensor<*xf32> func.return %2 : tensor<*xf32> } // CHECK: func @removing_identity_after_const -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> // CHECK: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) // ----- @@ -291,14 +291,14 @@ func.func @batch_norm_with_q_dq(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf } // CHECK: func @batch_norm_with_q_dq -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<0.707036077> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() {value = dense<-0.914072155> : tensor<2xf32>} : () -> tensor<2xf32> +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<0.707036077> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<-0.914072155> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK: %[[q_input:.*]] = "quantfork.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> // CHECK: %[[dq_input:.*]] = "quantfork.dcast"(%[[q_input]]) : (tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3xf32> // CHECK: %[[q_weight:.*]] = "quantfork.qcast"(%[[cst]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.005567213212411235,0.005567213212411235}>> // CHECK: %[[dq_weight:.*]] = "quantfork.dcast"(%[[q_weight]]) : (tensor<2x3x3x2x!quant.uniform:f32:3, {0.005567213212411235,0.005567213212411235}>>) -> tensor<2x3x3x2xf32> // CHECK: %[[conv:.*]] = "tf.Conv2D"(%[[dq_input]], %[[dq_weight]]) -// CHECK: %[[bias:.*]] = "tf.BiasAdd"(%[[conv]], %[[cst_0]]) {data_format = "NHWC"} +// CHECK: %[[bias:.*]] = "tf.BiasAdd"(%[[conv]], %[[cst_0]]) <{data_format = "NHWC"}> // CHECK: %[[relu6:.*]] = "tf.Relu6"(%[[bias]]) // ----- @@ -334,8 +334,8 @@ func.func @conv2d_with_large_weight_and_mul(%arg0: tensor) -> (tens func.return %2 : tensor } // CHECK: func @conv2d_with_large_weight_and_mul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<1.250000e+00> : tensor<48x48x3x256xf32>} : () -> tensor<48x48x3x256xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<2.000000e-01> : tensor<256xf32>} : () -> tensor<256xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.250000e+00> : tensor<48x48x3x256xf32>}> : () -> tensor<48x48x3x256xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<2.000000e-01> : tensor<256xf32>}> : () -> tensor<256xf32> // CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) // CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) // CHECK-NEXT: return %[[BIASADD]] @@ -354,8 +354,8 @@ func.func @depthwise_conv2d_with_large_weight_and_add(%arg0: tensor<*xf32>) -> ( func.return %2 : tensor } // CHECK: func @depthwise_conv2d_with_large_weight_and_add -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<2.500000e+00> : tensor<48x48x3x256xf32>} : () -> tensor<48x48x3x256xf32> -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = 
dense<8.000000e-01> : tensor<3xf32>} : () -> tensor<3xf32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.500000e+00> : tensor<48x48x3x256xf32>}> : () -> tensor<48x48x3x256xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<3xf32>}> : () -> tensor<3xf32> // CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) // CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) // CHECK-NEXT: return %[[BIASADD]] diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_quantize_drq.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_quantize_drq.mlir index 4da50d4ac91b31..0176867dd48865 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_quantize_drq.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_quantize_drq.mlir @@ -15,11 +15,11 @@ module { // CHECK-DAG: %[[CONST:.*]] = arith.constant dense<0.000000e+00> : tensor<2x1024xf32> // CHECK: %0 = "quantfork.qcast"(%[[CONST]]) : (tensor<2x1024xf32>) -> tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>> // CHECK: %1 = "quantfork.dcast"(%0) : (tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>>) -> tensor<2x1024xf32> -// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %2 : tensor<*xf32> // CHECK-LABEL: func private @composite_matmul_fn -// CHECK: %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %0 = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %0 : tensor<*xf32> } @@ -43,7 +43,7 @@ module { // CHECK-DAG: %[[CONST_1:.*]] = arith.constant dense<3.000000e+00> : tensor<2x3x3x512xf32> // CHECK: %0 = "quantfork.qcast"(%[[CONST_1]]) : (tensor<2x3x3x512xf32>) -> tensor<2x3x3x512x!quant.uniform:f32, 0.023622047244094488>> // CHECK: %1 = "quantfork.dcast"(%0) : (tensor<2x3x3x512x!quant.uniform:f32, 0.023622047244094488>>) -> tensor<2x3x3x512xf32> -// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> +// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> // CHECK: %3 = "tf.BiasAdd"(%2, %[[CONST_0]]) // CHECK: return %3 : tensor<*xf32> @@ -74,7 +74,7 @@ module { // CHECK-DAG: %[[CONST_1:.*]] = arith.constant dense<3.000000e+00> : tensor<2x3x1x1536xf32> // CHECK: %0 = "quantfork.qcast"(%[[CONST_1]]) : (tensor<2x3x1x1536xf32>) -> tensor<2x3x1x1536x!quant.uniform:f32, 0.023622047244094488>> // CHECK: %1 = "quantfork.dcast"(%0) : (tensor<2x3x1x1536x!quant.uniform:f32, 0.023622047244094488>>) 
-> tensor<2x3x1x1536xf32> -// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0} : (tensor<1x3x4x512xf32>, tensor<2x3x1x1536xf32>) -> tensor<*xf32> +// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) <{config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x512xf32>, tensor<2x3x1x1536xf32>) -> tensor<*xf32> // CHECK: %3 = "tf.BiasAdd"(%2, %[[CONST_0]]) // CHECK: return %3 : tensor<*xf32> @@ -85,6 +85,6 @@ module { // CHECK-LABEL: func private @composite_depthwise_conv2d_fn_0( // CHECK-SAME: %arg0: tensor<1x3x4x512xf32>, // CHECK-SAME: %arg1: tensor<2x3x1x1536xf32>) -// CHECK: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", +// CHECK: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]}> {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", device = ""} // CHECK: return %0 : tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_quantize_drq_per_channel.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_quantize_drq_per_channel.mlir index f2d80c0bf4e01a..927fc34fb564dd 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_quantize_drq_per_channel.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_quantize_drq_per_channel.mlir @@ -15,11 +15,11 @@ module { // CHECK-DAG: %[[CONST:.*]] = arith.constant dense<0.000000e+00> : tensor<2x1024xf32> // CHECK: %0 = "quantfork.qcast"(%[[CONST]]) : (tensor<2x1024xf32>) -> tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>> // CHECK: %1 = "quantfork.dcast"(%0) : (tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>>) -> tensor<2x1024xf32> -// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %2 : tensor<*xf32> // CHECK-LABEL: func private @composite_matmul_fn -// CHECK: %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %0 = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %0 : tensor<*xf32> } @@ -43,7 +43,7 @@ module { // CHECK-DAG: %[[CONST_1:.*]] = arith.constant dense<3.000000e+00> : tensor<2x3x512x2xf32> // CHECK: %0 = "quantfork.qcast"(%[[CONST_1]]) : (tensor<2x3x512x2xf32>) -> tensor<2x3x512x2x!quant.uniform:f32:3, {0.023622047244094488,0.023622047244094488}>> // CHECK: %1 = "quantfork.dcast"(%0) : (tensor<2x3x512x2x!quant.uniform:f32:3, {0.023622047244094488,0.023622047244094488}>>) -> tensor<2x3x512x2xf32> -// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) {_tfl_quant_trait = "fully_quantizable", 
config = "", config_proto = "", executor_type = "", f = @composite_conv2d_fn_1} : (tensor<1x3x4x512xf32>, tensor<2x3x512x2xf32>) -> tensor<*xf32> +// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x512xf32>, tensor<2x3x512x2xf32>) -> tensor<*xf32> // CHECK: %3 = "tf.BiasAdd"(%2, %[[CONST_0]]) // CHECK: return %3 : tensor<*xf32> @@ -74,7 +74,7 @@ module { // CHECK-DAG: %[[CONST_1:.*]] = arith.constant dense<3.000000e+00> : tensor<2x3x1x1536xf32> // CHECK: %0 = "quantfork.qcast"(%[[CONST_1]]) : (tensor<2x3x1x1536xf32>) -> tensor<2x3x1x1536x!quant.uniform:f32:3, {0.023622047244094488, // CHECK: %1 = "quantfork.dcast"(%0) : (tensor<2x3x1x1536x!quant.uniform:f32:3, {0.023622047244094488, -// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0} : (tensor<1x3x4x512xf32>, tensor<2x3x1x1536xf32>) -> tensor<*xf32> +// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) <{config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x512xf32>, tensor<2x3x1x1536xf32>) -> tensor<*xf32> // CHECK: %3 = "tf.BiasAdd"(%2, %[[CONST_0]]) // CHECK: return %3 : tensor<*xf32> @@ -85,6 +85,6 @@ module { // CHECK-LABEL: func private @composite_depthwise_conv2d_fn_0( // CHECK-SAME: %arg0: tensor<1x3x4x512xf32>, // CHECK-SAME: %arg1: tensor<2x3x1x1536xf32>) -// CHECK: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", +// CHECK: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]}> {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", device = ""} // CHECK: return %0 : tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op.mlir index 0ef69f6f6f791f..ae8a20d6b82fa0 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op.mlir @@ -22,8 +22,8 @@ module { // CHECK: %[[CONST_1:.*]] = arith.constant dense // CHECK-NOT: tensor<2x3x3x2xf32> // CHECK-SAME: tensor<2x3x1x6xf32> -// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1:.*]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0} : (tensor<1x3x4x3xf32>, tensor<2x3x1x6xf32>) -> tensor<*xf32> -// CHECK: %[[BIAS_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0:.*]]) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1:.*]]) <{config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3xf32>, tensor<2x3x1x6xf32>) -> tensor<*xf32> +// CHECK: %[[BIAS_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0:.*]]) <{data_format = "NHWC"}> {device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> // CHECK: return %[[BIAS_0:.*]] : tensor<*xf32> // CHECK-LABEL: func private @composite_depthwise_conv2d_fn( 
@@ -33,7 +33,7 @@ module { // CHECK-LABEL: func private @composite_depthwise_conv2d_fn_0( // CHECK-SAME: %arg0: tensor<1x3x4x3xf32>, // CHECK-SAME: %arg1: tensor<2x3x1x6xf32>) -// CHECK: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", +// CHECK: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]}> {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", device = ""} // CHECK: return %0 : tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op_weight_only.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op_weight_only.mlir index 4f36784e67a6d4..e80db7ff049b5b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op_weight_only.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op_weight_only.mlir @@ -23,14 +23,14 @@ module { // PerTensor: %[[CONST_1:.*]] = arith.constant dense // PerTensor-NOT: tensor<2x3x1x6xf32> // PerTensor-SAME: tensor<2x3x3x2xf32> -// PerTensor: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1:.*]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> -// PerTensor: %[[BIAS_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0:.*]]) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> +// PerTensor: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1:.*]]) <{config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> +// PerTensor: %[[BIAS_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0:.*]]) <{data_format = "NHWC"}> {device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> // PerTensor: return %[[BIAS_0:.*]] : tensor<*xf32> // PerTensor-LABEL: func private @composite_depthwise_conv2d_fn( // PerTensor-SAME: %arg0: tensor<1x3x4x3xf32>, // PerTensor-SAME: %arg1: tensor<2x3x3x2xf32>) -// PerTensor: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", +// PerTensor: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]}> {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", device = ""} // PerTensor: return %0 : tensor<*xf32> // PerChannel-LABEL: func @depthwise_conv @@ -38,8 +38,8 @@ module { // PerChannel: %[[CONST_1:.*]] = arith.constant dense // PerChannel-NOT: tensor<2x3x3x2xf32> // PerChannel-SAME: tensor<2x3x1x6xf32> -// PerChannel: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1:.*]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0} : (tensor<1x3x4x3xf32>, tensor<2x3x1x6xf32>) -> tensor<*xf32> -// PerChannel: %[[BIAS_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0:.*]]) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> +// PerChannel: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1:.*]]) <{config = "", config_proto 
= "", executor_type = "", f = @composite_depthwise_conv2d_fn_0}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3xf32>, tensor<2x3x1x6xf32>) -> tensor<*xf32> +// PerChannel: %[[BIAS_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0:.*]]) <{data_format = "NHWC"}> {device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> // PerChannel: return %[[BIAS_0:.*]] : tensor<*xf32> // PerChannel-LABEL: func private @composite_depthwise_conv2d_fn( @@ -49,7 +49,7 @@ module { // PerChannel-LABEL: func private @composite_depthwise_conv2d_fn_0( // PerChannel-SAME: %arg0: tensor<1x3x4x3xf32>, // PerChannel-SAME: %arg1: tensor<2x3x1x6xf32>) -// PerChannel: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", +// PerChannel: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]}> {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", device = ""} // PerChannel: return %0 : tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/propagate_quantize_type.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/propagate_quantize_type.mlir index 0c69477a0c8c42..6a737b7b20e1e4 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/propagate_quantize_type.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/propagate_quantize_type.mlir @@ -12,8 +12,8 @@ module { } // CHECK-LABEL: func @not_propagate_matmul -// CHECK: %[[CASTED_W:.*]] = "tf.Cast"(%0) {Truncate = false} : (tensor<2x1024xi8>) -> tensor<2x1024xf32> -// CHECK: %2 = "tf.MatMul"(%arg0, %[[CASTED_W]]) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[CASTED_W:.*]] = "tf.Cast"(%0) <{Truncate = false}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %2 = "tf.MatMul"(%arg0, %[[CASTED_W]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> } // ----- @@ -37,8 +37,8 @@ module { // CHECK-LABEL: func @propagate_xladotv2_bf16 // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%cst) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> -// CHECK: %[[MATMUL:.*]] = "tf.XlaDotV2"(%arg0, %[[IDENTITY]]) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor<1x2x2x2xbf16>, tensor<2x1024xi8>) -> tensor<1x2x2x1024xbf16> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[MATMUL]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<1x2x2x1024xbf16>) -> tensor<1x2x2x1024xbf16> +// CHECK: %[[MATMUL:.*]] = "tf.XlaDotV2"(%arg0, %[[IDENTITY]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor<1x2x2x2xbf16>, tensor<2x1024xi8>) -> tensor<1x2x2x1024xbf16> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[MATMUL]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1x2x2x1024xbf16>) -> tensor<1x2x2x1024xbf16> } // ----- @@ -64,8 +64,8 @@ module { // CHECK-LABEL: func @not_propagate_last_op // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%cst_0) : (tensor<200x100x300xi8>) -> tensor<200x100x300xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[IDENTITY]]) {config = "", config_proto = "", 
executor_type = "", f = @composite_dequantize_uniform} : (tensor<200x100x300xi8>) -> tensor<200x100x300xf32> -// CHECK: %[[GATHER:.*]] = "tf.XlaGather"(%[[DEQUANTIZED]], %arg0, %cst) {dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[IDENTITY]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<200x100x300xi8>) -> tensor<200x100x300xf32> +// CHECK: %[[GATHER:.*]] = "tf.XlaGather"(%[[DEQUANTIZED]], %arg0, %cst) <{dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true}> : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xf32> // CHECK: return %[[GATHER]] : tensor<1x300x10xf32> // ----- @@ -91,7 +91,7 @@ module { // CHECK-LABEL: func @propagate_xlagather // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%cst_0) : (tensor<200x100x300xi8>) -> tensor<200x100x300xi8> -// CHECK: %[[GATHER:.*]] = "tf.XlaGather"(%[[IDENTITY]], %arg0, %cst) {dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true} : (tensor<200x100x300xi8>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[GATHER]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<1x300x10xi8>) -> tensor<1x300x10xf32> +// CHECK: %[[GATHER:.*]] = "tf.XlaGather"(%[[IDENTITY]], %arg0, %cst) <{dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true}> : (tensor<200x100x300xi8>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[GATHER]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1x300x10xi8>) -> tensor<1x300x10xf32> // CHECK: %[[ORIGINAL_IDENTITY:.*]] = "tf.Identity"(%[[DEQUANTIZED]]) : (tensor<1x300x10xf32>) -> tensor<1x300x10xf32> // CHECK: return %[[ORIGINAL_IDENTITY]] : tensor<1x300x10xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir index d04ec262f6ff85..0f3c7024dba4b4 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir @@ -23,7 +23,7 @@ func.func private @conv(%input: tensor<1x3x4x3xf32> {tf._user_specified_name = " // CHECK-DAG: [[weight:%.+]] = "arith.constant"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2x!quant.uniform> // CHECK: [[q_input:%.+]] = "quantfork.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> // CHECK-NEXT: [[q_bias:%.+]] = "quantfork.qcast"([[bias]]) : (tensor<2xf32>) -> tensor<2x!quant.uniform> -// CHECK-NEXT: [[conv:%.+]] = "tf.PartitionedCall"([[q_input]], [[weight]], [[q_bias]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @[[composite_fn:composite_conv2d_with_bias_and_relu6_fn.*]]} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> +// CHECK-NEXT: [[conv:%.+]] = "tf.PartitionedCall"([[q_input]], [[weight]], [[q_bias]]) <{config = "", config_proto = "", executor_type = "", f = @[[composite_fn:composite_conv2d_with_bias_and_relu6_fn.*]]}> {_tfl_quant_trait = 
"fully_quantizable"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> // CHECK-NEXT: [[res:%.+]] = "quantfork.dcast"([[conv]]) : (tensor<*x!quant.uniform>) -> tensor<*xf32> // CHECK-NEXT: "func.return"([[res]]) : (tensor<*xf32>) -> () @@ -69,11 +69,11 @@ func.func @avgpool_test(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: %[[q:.*]] = "quantfork.qcast"(%arg0) // CHECK: %[[sc1:.*]] = "quantfork.scast"(%[[q]]) : (tensor<*x!quant.uniform>) -// CHECK: %[[fcast:.*]] = "tf.Cast"(%[[sc1]]) {Truncate = false} : (tensor<*xi8>) -> tensor<*xf32> +// CHECK: %[[fcast:.*]] = "tf.Cast"(%[[sc1]]) <{Truncate = false}> : (tensor<*xi8>) -> tensor<*xf32> // CHECK: %[[avgpool_f32:.*]] = "tf.AvgPool"(%[[fcast]]) // CHECK-SAME: (tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[round:.*]] = "tf.Round"(%[[avgpool_f32]]) -// CHECK: %[[icast:.*]] = "tf.Cast"(%[[round]]) {Truncate = false} : (tensor<*xf32>) -> tensor<*xi8> +// CHECK: %[[icast:.*]] = "tf.Cast"(%[[round]]) <{Truncate = false}> : (tensor<*xf32>) -> tensor<*xi8> // CHECK: %[[sc2:.*]] = "quantfork.scast"(%[[icast]]) // CHECK: %[[dq:.*]] = "quantfork.dcast"(%[[sc2]]) : (tensor<*x!quant.uniform>) // CHECK: return %[[dq]] diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions.mlir index eab1244fb62b3d..5b5addde086898 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions.mlir @@ -28,17 +28,17 @@ module { } // CHECK-LABEL: func @conv -// CHECK-DAG: %[[w_float:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}1.600000e-01 -// CHECK-DAG: %[[b_float:.*]] = "tf.Const"() {value = dense<[-2.000000e+00, 3.000000e+00]> : tensor<2xf32> -// CHECK-DAG: %[[in_scale:.*]] = "tf.Const"() {value = dense<8.000000e-03> : tensor} : () -> tensor -// CHECK-DAG: %[[in_zp:.*]] = "tf.Const"() {value = dense<0> : tensor} -// CHECK-DAG: %[[w_scale:.*]] = "tf.Const"() {value = dense<[4.000000e-03 -// CHECK-DAG: %[[w_zp:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} -// CHECK-DAG: %[[b_scale:.*]] = "tf.Const"() {value = dense<[3.200000e-05, 4.000000e-05]> : tensor<2xf32>} -// CHECK-DAG: %[[out_scale:.*]] = "tf.Const"() {value = dense<5.000000e-02> : tensor} -// CHECK-DAG: %[[out_zp:.*]] = "tf.Const"() {value = dense<-1> : tensor} -// CHECK-DAG: %[[b_quant:.*]] = "tf.Const"() {value = dense<[-62500, 75000]> : tensor<2xi32>} -// CHECK-DAG: %[[w_quant:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}40, 20] +// CHECK-DAG: %[[w_float:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}1.600000e-01 +// CHECK-DAG: %[[b_float:.*]] = "tf.Const"() <{value = dense<[-2.000000e+00, 3.000000e+00]> : tensor<2xf32> +// CHECK-DAG: %[[in_scale:.*]] = "tf.Const"() <{value = dense<8.000000e-03> : tensor}> : () -> tensor +// CHECK-DAG: %[[in_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> +// CHECK-DAG: %[[w_scale:.*]] = "tf.Const"() <{value = dense<[4.000000e-03 +// CHECK-DAG: %[[w_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> +// CHECK-DAG: %[[b_scale:.*]] = "tf.Const"() <{value = dense<[3.200000e-05, 4.000000e-05]> : tensor<2xf32>} +// CHECK-DAG: %[[out_scale:.*]] = "tf.Const"() <{value = dense<5.000000e-02> : tensor}> +// CHECK-DAG: %[[out_zp:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> +// CHECK-DAG: %[[b_quant:.*]] = "tf.Const"() <{value = 
dense<[-62500, 75000]> : tensor<2xi32>}> +// CHECK-DAG: %[[w_quant:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}40, 20] // CHECK-DAG: {{\[\[\[}}-87, -42] // CHECK: %[[quantize:.*]] = "tf.PartitionedCall"(%arg0, %[[in_scale]], %[[in_zp]]) @@ -58,7 +58,8 @@ module { // CHECK-LABEL: func private @composite_conv2d_with_bias_and_relu6_fn_1 // CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D" -// CHECK-SAME: data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true +// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true +// CHECK-SAME: device = "" // CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd" // CHECK: %[[RELU6_0:.*]] = "tf.Relu6" @@ -158,10 +159,10 @@ module { // CHECK: %[[conv_quant:.*]] = "tf.PartitionedCall"(%[[quantize]] // CHECK-SAME: f = @quantized_conv2d_with_bias_and_relu6_fn_0 // CHECK-SAME: (tensor<1x2x2x3xi8>, tensor<2x2x3x2xi8>, tensor<2xi32>, tensor, tensor, tensor<2xf32>, tensor<2xi32>, tensor<2xf32>, tensor<2xi32>, tensor, tensor) -> tensor<*xi8> -// CHECK: %[[cast_1:.*]] = "tf.Cast"(%[[conv_quant]]) {Truncate = false} : (tensor<*xi8>) -> tensor<*xf32> -// CHECK: %[[avgpool:.*]] = "tf.AvgPool"(%[[cast_1]]) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[cast_1:.*]] = "tf.Cast"(%[[conv_quant]]) <{Truncate = false}> : (tensor<*xi8>) -> tensor<*xf32> +// CHECK: %[[avgpool:.*]] = "tf.AvgPool"(%[[cast_1]]) <{data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]}> : (tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[round:.*]] = "tf.Round"(%[[avgpool]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: %[[cast_2:.*]] = "tf.Cast"(%[[round]]) {Truncate = false} : (tensor<*xf32>) -> tensor<*xi8> +// CHECK: %[[cast_2:.*]] = "tf.Cast"(%[[round]]) <{Truncate = false}> : (tensor<*xf32>) -> tensor<*xi8> // CHECK: %[[dequantize:.*]] = "tf.PartitionedCall"(%[[cast_2]] // CHECK-SAME: f = @dequantize_i8 // CHECK: return %[[dequantize]] @@ -252,35 +253,35 @@ module { } // CHECK-LABE: @conv_with_dump -// CHECK-DAG: %[[w0_float:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}-0.282878935, -0.211567819 -// CHECK-DAG: %[[b0_float:.*]] = "tf.Const"() {value = dense<[-0.0192535277, -5.998660e-03]> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[w1_float:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}0.208403707, 0.478067577 -// CHECK-DAG: %[[b1_float:.*]] = "tf.Const"() {value = dense<[-0.0291469581, 0.0106381178]> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[w0_quantized:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}-59, -44 -// CHECK-DAG: %[[b0_quantized:.*]] = "tf.Const"() {value = dense<[-1040, -324]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[w1_quantized:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}44, 100 -// CHECK-DAG: %[[b1_quantized:.*]] = "tf.Const"() {value = dense<[-4312, 1574]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[in_scale:.*]] = "tf.Const"() {value = dense<0.00387597573> : tensor} : () -> tensor -// CHECK-DAG: %[[in_out_zp:.*]] = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor -// CHECK-DAG: %[[w0_scale:.*]] = "tf.Const"() {value = dense<0.00477493973> : tensor} : () -> tensor -// CHECK-DAG: %[[w_b_zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[b0_scale:.*]] = "tf.Const"() {value = 
dense<1.85075514E-5> : tensor} : () -> tensor -// CHECK-DAG: %[[mid_scale:.*]] = "tf.Const"() {value = dense<0.00141507247> : tensor} : () -> tensor -// CHECK-DAG: %[[w1_scale:.*]] = "tf.Const"() {value = dense<0.00477652298> : tensor} : () -> tensor -// CHECK-DAG: %[[b1_scale:.*]] = "tf.Const"() {value = dense<6.75912588E-6> : tensor} : () -> tensor -// CHECK-DAG: %[[out_scale:.*]] = "tf.Const"() {value = dense<7.24974147E-4> : tensor} : () -> tensor -// CHECK-DAG: %[[arg_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[in_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantize_i8} -// CHECK-DAG: %[[conv0_quantized:.*]] = "tf.PartitionedCall"(%[[arg_quantized]], %[[w0_quantized]], %[[b0_quantized]], %[[in_scale]], %[[in_out_zp]], %[[w0_scale]], %[[w_b_zp]], %[[b0_scale]], %[[w_b_zp]], %[[mid_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_fn_1} -// CHECK-DAG: %[[conv0_dequantized:.*]] = "tf.PartitionedCall"(%[[conv0_quantized]], %[[mid_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @dequantize_i8} -// CHECK-DAG: %[[conv1_quantized:.*]] = "tf.PartitionedCall"(%[[conv0_quantized]], %[[w1_quantized]], %[[b1_quantized]], %[[mid_scale]], %[[in_out_zp]], %[[w1_scale]], %[[w_b_zp]], %[[b1_scale]], %[[w_b_zp]], %[[out_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_fn_0} -// CHECK-DAG: %[[conv1_dequantized_0:.*]] = "tf.PartitionedCall"(%[[conv1_quantized]], %[[out_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @dequantize_i8} -// CHECK-DAG: %[[conv1_dequantized_1:.*]] = "tf.PartitionedCall"(%[[conv1_quantized]], %[[out_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @dequantize_i8} +// CHECK-DAG: %[[w0_float:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.282878935, -0.211567819 +// CHECK-DAG: %[[b0_float:.*]] = "tf.Const"() <{value = dense<[-0.0192535277, -5.998660e-03]> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[w1_float:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}0.208403707, 0.478067577 +// CHECK-DAG: %[[b1_float:.*]] = "tf.Const"() <{value = dense<[-0.0291469581, 0.0106381178]> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[w0_quantized:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-59, -44 +// CHECK-DAG: %[[b0_quantized:.*]] = "tf.Const"() <{value = dense<[-1040, -324]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[w1_quantized:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}44, 100 +// CHECK-DAG: %[[b1_quantized:.*]] = "tf.Const"() <{value = dense<[-4312, 1574]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[in_scale:.*]] = "tf.Const"() <{value = dense<0.00387597573> : tensor}> : () -> tensor +// CHECK-DAG: %[[in_out_zp:.*]] = "tf.Const"() <{value = dense<-128> : tensor}> : () -> tensor +// CHECK-DAG: %[[w0_scale:.*]] = "tf.Const"() <{value = dense<0.00477493973> : tensor}> : () -> tensor +// CHECK-DAG: %[[w_b_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[b0_scale:.*]] = "tf.Const"() <{value = dense<1.85075514E-5> : tensor}> : () -> tensor +// CHECK-DAG: %[[mid_scale:.*]] = "tf.Const"() <{value = dense<0.00141507247> : tensor}> : () -> tensor +// CHECK-DAG: %[[w1_scale:.*]] = "tf.Const"() <{value = dense<0.00477652298> : tensor}> : () -> tensor +// CHECK-DAG: %[[b1_scale:.*]] = "tf.Const"() <{value = 
dense<6.75912588E-6> : tensor}> : () -> tensor +// CHECK-DAG: %[[out_scale:.*]] = "tf.Const"() <{value = dense<7.24974147E-4> : tensor}> : () -> tensor +// CHECK-DAG: %[[arg_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[in_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantize_i8}> +// CHECK-DAG: %[[conv0_quantized:.*]] = "tf.PartitionedCall"(%[[arg_quantized]], %[[w0_quantized]], %[[b0_quantized]], %[[in_scale]], %[[in_out_zp]], %[[w0_scale]], %[[w_b_zp]], %[[b0_scale]], %[[w_b_zp]], %[[mid_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_fn_1}> +// CHECK-DAG: %[[conv0_dequantized:.*]] = "tf.PartitionedCall"(%[[conv0_quantized]], %[[mid_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @dequantize_i8}> +// CHECK-DAG: %[[conv1_quantized:.*]] = "tf.PartitionedCall"(%[[conv0_quantized]], %[[w1_quantized]], %[[b1_quantized]], %[[mid_scale]], %[[in_out_zp]], %[[w1_scale]], %[[w_b_zp]], %[[b1_scale]], %[[w_b_zp]], %[[out_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_fn_0}> +// CHECK-DAG: %[[conv1_dequantized_0:.*]] = "tf.PartitionedCall"(%[[conv1_quantized]], %[[out_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @dequantize_i8}> +// CHECK-DAG: %[[conv1_dequantized_1:.*]] = "tf.PartitionedCall"(%[[conv1_quantized]], %[[out_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @dequantize_i8}> // CHECK-DAG: %[[identity:.*]] = "tf.Identity"(%[[conv1_dequantized_1]]) -// CHECK-DAG: %[[conv0_float:.*]] = "tf.PartitionedCall"(%arg0, %[[w0_float]], %[[b0_float]]) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_00} -// CHECK-DAG: %[[conv1_float:.*]] = "tf.PartitionedCall"(%[[conv0_dequantized]], %[[w1_float]], %[[b1_float]]) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_00} -// CHECK-DAG: "tf.DumpTensor"(%[[conv0_dequantized]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} -// CHECK-DAG: "tf.DumpTensor"(%[[conv0_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} -// CHECK-DAG: "tf.DumpTensor"(%[[conv1_dequantized_0]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} -// CHECK-DAG: "tf.DumpTensor"(%[[conv1_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} +// CHECK-DAG: %[[conv0_float:.*]] = "tf.PartitionedCall"(%arg0, %[[w0_float]], %[[b0_float]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_00}> {device = ""} +// CHECK-DAG: %[[conv1_float:.*]] = "tf.PartitionedCall"(%[[conv0_dequantized]], %[[w1_float]], %[[b1_float]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_00}> {device = ""} +// 
CHECK-DAG: "tf.DumpTensor"(%[[conv0_dequantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> {device = ""} +// CHECK-DAG: "tf.DumpTensor"(%[[conv0_float]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> {device = ""} +// CHECK-DAG: "tf.DumpTensor"(%[[conv1_dequantized_0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> {device = ""} +// CHECK-DAG: "tf.DumpTensor"(%[[conv1_float]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> {device = ""} // CHECK-DAG: return %[[identity]] } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_drq.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_drq.mlir index 0731e6bfe8e975..6ec99282a8e252 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_drq.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_drq.mlir @@ -13,11 +13,11 @@ module { } // CHECK-LABEL: func @matmul -// CHECK-DAG: %[[q_w:.*]] = "tf.Const"() {value = #tf_type : tensor} : () -> tensor -// CHECK-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: %0 = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// CHECK-SAME: f = @quantized_matmul_fn_0} : (tensor<2x12xf32>, tensor<12x2x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> +// CHECK-DAG: %[[q_w:.*]] = "tf.Const"() <{value = #tf_type : tensor}> : () -> tensor +// CHECK-DAG: %[[zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK: %0 = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) <{config = "", config_proto = "", executor_type = "", +// CHECK-SAME: f = @quantized_matmul_fn_0}> : (tensor<2x12xf32>, tensor<12x2x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> // CHECK-LABEL: func private @quantized_matmul_fn_0 // CHECK: %0 = "tf.UniformQuantizedDotHybrid"(%arg0, %arg1, %arg2, %arg3) @@ -48,11 +48,11 @@ module { } // CHECK-LABEL: func @conv -// CHECK-DAG: %[[q_w:.*]] = "tf.Const"() {value = #tf_type : tensor} : () -> tensor -// CHECK-DAG: %[[w_zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: %[[quantize_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[w_scale]], %[[w_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> -// CHECK: %[[quantize_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[w_scale]], %[[w_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_fn_0} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> +// CHECK-DAG: %[[q_w:.*]] = "tf.Const"() <{value = #tf_type : tensor}> : () -> tensor +// CHECK-DAG: %[[w_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK: %[[quantize_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[w_scale]], %[[w_zp]]) 
<{config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_fn_1}> : (tensor<1x2x2x3xf32>, tensor<2x3x3x2x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> +// CHECK: %[[quantize_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[w_scale]], %[[w_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_fn_0}> : (tensor<1x2x2x3xf32>, tensor<2x3x3x2x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> // CHECK: return %[[quantize_1]], %[[quantize_2]] // CHECK-LABEL: func private @quantized_conv2d_fn_0 @@ -102,17 +102,17 @@ module { } // CHECK-LABEL: func @depthwise_conv -// CHECK-DAG: %[[bias:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3xf32>} : () -> tensor<3xf32> -// CHECK-DAG: %[[q_w1:.*]] = "tf.Const"() {value = #tf_type : tensor<3xf32>}> : () -> tensor<3xf32> +// CHECK-DAG: %[[q_w1:.*]] = "tf.Const"() <{value = #tf_type tensor<2x3x1x3x!tf_type.qint8> -// CHECK-DAG: %[[q_w2:.*]] = "tf.Const"() {value = #tf_type tensor<2x3x1x6x!tf_type.qint8> -// CHECK-DAG: %[[w_scale:.*]] = "tf.Const"() {value = dense<0.0236220472> : tensor} : () -> tensor -// CHECK-DAG: %[[w_zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %[[w_scale:.*]] = "tf.Const"() <{value = dense<0.0236220472> : tensor}> : () -> tensor +// CHECK-DAG: %[[w_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor -// CHECK: %[[quantize_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w1]], %[[w_scale]], %[[w_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantized_depthwise_conv2d_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x1x3x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> -// CHECK: %[[quantize_1_add:.*]] = "tf.BiasAdd"(%[[quantize_1]], %[[bias]]) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<3xf32>) -> tensor<*xf32> -// CHECK: %[[quantize_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w2]], %[[w_scale]], %[[w_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantized_depthwise_conv2d_fn_0} : (tensor<1x3x4x3xf32>, tensor<2x3x1x6x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> +// CHECK: %[[quantize_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w1]], %[[w_scale]], %[[w_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantized_depthwise_conv2d_fn_1}> : (tensor<1x3x4x3xf32>, tensor<2x3x1x3x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> +// CHECK: %[[quantize_1_add:.*]] = "tf.BiasAdd"(%[[quantize_1]], %[[bias]]) <{data_format = "NHWC"}> {device = ""} : (tensor<*xf32>, tensor<3xf32>) -> tensor<*xf32> +// CHECK: %[[quantize_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w2]], %[[w_scale]], %[[w_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantized_depthwise_conv2d_fn_0}> : (tensor<1x3x4x3xf32>, tensor<2x3x1x6x!tf_type.qint8>, tensor, tensor) -> tensor<*xf32> // CHECK: return %[[quantize_1_add]], %[[quantize_2]] // CHECK-LABEL: func private @quantized_depthwise_conv2d_fn_0 diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_weight_only.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_weight_only.mlir index 8c0786178ee37e..ba8f21380b33e6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_weight_only.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_weight_only.mlir @@ -15,19 +15,19 @@ module { } // PerTensor-LABEL: func @matmul -// PerTensor-DAG: %[[q_w:.*]] = "tf.Const"() {value = dense<0> : 
tensor<12x2xi8>} : () -> tensor<12x2xi8> -// PerTensor-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<3.93700805E-9> : tensor} : () -> tensor -// PerTensor-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// PerTensor: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// PerTensor-SAME: f = @quantized_matmul_fn_0} : (tensor<2x12xf32>, tensor<12x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor-DAG: %[[q_w:.*]] = "tf.Const"() <{value = dense<0> : tensor<12x2xi8>}> : () -> tensor<12x2xi8> +// PerTensor-DAG: %[[scale:.*]] = "tf.Const"() <{value = dense<3.93700805E-9> : tensor}> : () -> tensor +// PerTensor-DAG: %[[zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// PerTensor: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) <{config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_matmul_fn_0}> : (tensor<2x12xf32>, tensor<12x2xi8>, tensor, tensor) -> tensor<*xf32> // PerTensor: return %[[out]] // PerChannel-LABEL: func @matmul -// PerChannel-DAG: %[[q_w:.*]] = "tf.Const"() {value = dense<0> : tensor<12x2xi8>} : () -> tensor<12x2xi8> -// PerChannel-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<3.93700805E-9> : tensor} : () -> tensor -// PerChannel-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// PerChannel: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// PerChannel-SAME: f = @quantized_matmul_fn_0} : (tensor<2x12xf32>, tensor<12x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerChannel-DAG: %[[q_w:.*]] = "tf.Const"() <{value = dense<0> : tensor<12x2xi8>}> : () -> tensor<12x2xi8> +// PerChannel-DAG: %[[scale:.*]] = "tf.Const"() <{value = dense<3.93700805E-9> : tensor}> : () -> tensor +// PerChannel-DAG: %[[zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// PerChannel: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) <{config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_matmul_fn_0}> : (tensor<2x12xf32>, tensor<12x2xi8>, tensor, tensor) -> tensor<*xf32> // PerChannel: return %[[out]] // ----- @@ -51,23 +51,23 @@ module { } // PerTensor-LABEL: func @conv -// PerTensor-DAG: %[[q_w:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> -// PerTensor-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<{{[0-9\.Ee\+\-]+}}> : tensor} : () -> tensor -// PerTensor-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor} : () -> tensor -// PerTensor: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// PerTensor-SAME: f = @quantized_conv2d_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> -// PerTensor: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// PerTensor-SAME: f = @quantized_conv2d_fn_0} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor-DAG: %[[q_w:.*]] = "tf.Const"() <{value = dense<{{[0-9]+}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2xi8> +// PerTensor-DAG: %[[scale:.*]] = "tf.Const"() <{value = dense<{{[0-9\.Ee\+\-]+}}> : tensor}> : () -> tensor +// PerTensor-DAG: %[[zp:.*]] = "tf.Const"() <{value = dense<{{[0-9]+}}> : tensor}> : () -> tensor 
+// PerTensor: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) <{config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_conv2d_fn_1}> : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) <{config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_conv2d_fn_0}> : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> // PerTensor: return %[[out_1]], %[[out_2]] // PerChannel-LABEL: func @conv -// PerChannel-DAG: %[[q_w:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> -// PerChannel-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<{{[0-9\.Ee\+\-]+}}> : tensor<2xf32>} : () -> tensor<2xf32> -// PerChannel-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2xi32>} : () -> tensor<2xi32> -// PerChannel: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// PerChannel-SAME: f = @quantized_conv2d_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor<2xf32>, tensor<2xi32>) -> tensor<*xf32> -// PerChannel: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// PerChannel-SAME: f = @quantized_conv2d_fn_0} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor<2xf32>, tensor<2xi32>) -> tensor<*xf32> +// PerChannel-DAG: %[[q_w:.*]] = "tf.Const"() <{value = dense<{{[0-9]+}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2xi8> +// PerChannel-DAG: %[[scale:.*]] = "tf.Const"() <{value = dense<{{[0-9\.Ee\+\-]+}}> : tensor<2xf32>}> : () -> tensor<2xf32> +// PerChannel-DAG: %[[zp:.*]] = "tf.Const"() <{value = dense<{{[0-9]+}}> : tensor<2xi32>}> : () -> tensor<2xi32> +// PerChannel: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) <{config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_conv2d_fn_1}> : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor<2xf32>, tensor<2xi32>) -> tensor<*xf32> +// PerChannel: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) <{config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_conv2d_fn_0}> : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor<2xf32>, tensor<2xi32>) -> tensor<*xf32> // PerChannel: return %[[out_1]], %[[out_2]] } @@ -98,30 +98,30 @@ module { } // PerTensor-LABEL: func @depthwise_conv -// PerTensor-DAG: %[[q_w1:.*]] = "tf.Const"() {value = dense<127> : tensor<2x3x3x1xi8>} -// PerTensor-DAG: %[[q_w2:.*]] = "tf.Const"() {value = dense<127> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> -// PerTensor-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<0.0236220472> : tensor} : () -> tensor -// PerTensor-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// PerTensor-DAG: %[[bias:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3xf32>} -// PerTensor: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w1]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// PerTensor-SAME: f = @quantized_depthwise_conv2d_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor-DAG: %[[q_w1:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x3x3x1xi8>}> +// PerTensor-DAG: %[[q_w2:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x3x3x2xi8>}> : () -> 
tensor<2x3x3x2xi8> +// PerTensor-DAG: %[[scale:.*]] = "tf.Const"() <{value = dense<0.0236220472> : tensor}> : () -> tensor +// PerTensor-DAG: %[[zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// PerTensor-DAG: %[[bias:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3xf32>}> +// PerTensor: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w1]], %[[scale]], %[[zp]]) <{config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_depthwise_conv2d_fn_1}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xi8>, tensor, tensor) -> tensor<*xf32> // PerTensor: %[[out_1_add:.*]] = "tf.BiasAdd"(%[[out_1]], %[[bias]]) -// PerTensor: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w2]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// PerTensor-SAME: f = @quantized_depthwise_conv2d_fn_0} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w2]], %[[scale]], %[[zp]]) <{config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_depthwise_conv2d_fn_0}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> // PerTensor: return %[[out_1_add]], %[[out_2]] // PerChannel-LABEL: func @depthwise_conv -// PerChannel-DAG: %[[bias1:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3xf32>} : () -> tensor<3xf32> -// PerChannel-DAG: %[[q_w1:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2x3x3x1xi8>} : () -> tensor<2x3x3x1xi8> -// PerChannel-DAG: %[[q_w2:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> -// PerChannel-DAG: %[[scale1:.*]] = "tf.Const"() {value = dense<{{[0-9\.Ee\+\-]+}}> : tensor<3xf32>} : () -> tensor<3xf32> -// PerChannel-DAG: %[[scale2:.*]] = "tf.Const"() {value = dense<{{[0-9\.Ee\+\-]+}}> : tensor<6xf32>} : () -> tensor<6xf32> -// PerChannel-DAG: %[[zp1:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<3xi32>} : () -> tensor<3xi32> -// PerChannel-DAG: %[[zp2:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<6xi32>} : () -> tensor<6xi32> -// PerChannel: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w1]], %[[scale1]], %[[zp1]]) {config = "", config_proto = "", executor_type = "", -// PerChannel-SAME: f = @quantized_depthwise_conv2d_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xi8>, tensor<3xf32>, tensor<3xi32>) -> tensor<*xf32> +// PerChannel-DAG: %[[bias1:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<3xf32>}> : () -> tensor<3xf32> +// PerChannel-DAG: %[[q_w1:.*]] = "tf.Const"() <{value = dense<{{[0-9]+}}> : tensor<2x3x3x1xi8>}> : () -> tensor<2x3x3x1xi8> +// PerChannel-DAG: %[[q_w2:.*]] = "tf.Const"() <{value = dense<{{[0-9]+}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2xi8> +// PerChannel-DAG: %[[scale1:.*]] = "tf.Const"() <{value = dense<{{[0-9\.Ee\+\-]+}}> : tensor<3xf32>}> : () -> tensor<3xf32> +// PerChannel-DAG: %[[scale2:.*]] = "tf.Const"() <{value = dense<{{[0-9\.Ee\+\-]+}}> : tensor<6xf32>}> : () -> tensor<6xf32> +// PerChannel-DAG: %[[zp1:.*]] = "tf.Const"() <{value = dense<{{[0-9]+}}> : tensor<3xi32>}> : () -> tensor<3xi32> +// PerChannel-DAG: %[[zp2:.*]] = "tf.Const"() <{value = dense<{{[0-9]+}}> : tensor<6xi32>}> : () -> tensor<6xi32> +// PerChannel: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w1]], %[[scale1]], %[[zp1]]) <{config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_depthwise_conv2d_fn_1}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xi8>, 
tensor<3xf32>, tensor<3xi32>) -> tensor<*xf32> // PerChannel: %[[out_1_add:.*]] = "tf.BiasAdd"(%[[out_1]], %[[bias1]]) -// PerChannel: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w2]], %[[scale2]], %[[zp2]]) {config = "", config_proto = "", executor_type = "", -// PerChannel-SAME: f = @quantized_depthwise_conv2d_fn_0} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xi8>, tensor<6xf32>, tensor<6xi32>) -> tensor<*xf32> +// PerChannel: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w2]], %[[scale2]], %[[zp2]]) <{config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_depthwise_conv2d_fn_0}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xi8>, tensor<6xf32>, tensor<6xi32>) -> tensor<*xf32> // PerChannel: return %[[out_1_add]], %[[out_2]] } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir index 38f0662e097063..38b41273f3016b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir @@ -1,4 +1,5 @@ // RUN: tf-quant-opt %s -split-input-file -quant-insert-quantized-functions -quant-quantize-composite-functions='target-opset=XLA' | FileCheck %s +// RUN: tf-quant-opt %s -split-input-file -quant-insert-quantized-functions -quant-quantize-composite-functions='target-opset=XLA enable-per-channel-quantization=true' | FileCheck --check-prefix=PerChannel %s module { func.func @conv_with_single_layer(%arg0: tensor<1x2x2x3xf32>) -> (tensor<*xf32>) { @@ -32,7 +33,7 @@ module { // CHECK-LABEL: func private @quantized_conv2d_with_bias_and_relu6_float_output_fn_0 // CHECK-SAME: (%arg0: tensor<1x2x2x3xi8>, %arg1: tensor<2x2x3x2xi8>, %arg2: tensor<2xi32>, %arg3: tensor, %arg4: tensor, %arg5: tensor<2xf32>, %arg6: tensor<2xi32>, %arg7: tensor<2xf32>, %arg8: tensor<2xi32>, %arg9: tensor, %arg10: tensor) -> tensor<*xf32> // CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D" -// CHECK-SAME: {dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} +// CHECK-SAME: <{dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> // CHECK: -------- Quantization Summary -------- // CHECK: Number of quantized layers in the model @@ -123,7 +124,7 @@ module { // CHECK: %[[conv_quant:.*]] = "tf.PartitionedCall"(%[[quantize]] // CHECK-SAME: f = @quantized_conv2d_with_bias_and_relu6_fn_0 // CHECK-SAME: (tensor<1x2x2x3xi8>, tensor<2x2x3x2xi8>, tensor<2xi32>, tensor, tensor, tensor<2xf32>, tensor<2xi32>, tensor<2xf32>, tensor<2xi32>, tensor, tensor) -> tensor<*xi8> -// CHECK: %[[maxpool:.*]] = "tf.MaxPool"(%[[conv_quant]]) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<*xi8>) -> tensor<*xi8> +// CHECK: %[[maxpool:.*]] = "tf.MaxPool"(%[[conv_quant]]) <{data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]}> : (tensor<*xi8>) -> tensor<*xi8> // CHECK: %[[dequantize:.*]] = "tf.PartitionedCall"(%[[maxpool]] // CHECK-SAME: f = @dequantize_i8 // CHECK: return %[[dequantize]] @@ -297,34 +298,94 @@ module { func.return %2 : tensor<*xf32> } -// CHECK-LABE: @conv_with_dump -// CHECK-DAG: %[[w0_float:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}-0.282878935, -0.211567819 -// CHECK-DAG: %[[b0_float:.*]] = "tf.Const"() {value = 
dense<[-0.0192535277, -5.998660e-03]> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[w1_float:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}0.208403707, 0.478067577 -// CHECK-DAG: %[[b1_float:.*]] = "tf.Const"() {value = dense<[-0.0291469581, 0.0106381178]> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[w0_quantized:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}-59, -44 -// CHECK-DAG: %[[b0_quantized:.*]] = "tf.Const"() {value = dense<[-1040, -324]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[w1_quantized:.*]] = "tf.Const"() {value = dense<{{\[\[\[\[}}44, 100 -// CHECK-DAG: %[[b1_quantized:.*]] = "tf.Const"() {value = dense<[-4312, 1574]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[in_scale:.*]] = "tf.Const"() {value = dense<0.00387597573> : tensor} : () -> tensor -// CHECK-DAG: %[[in_out_zp:.*]] = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor -// CHECK-DAG: %[[w0_scale:.*]] = "tf.Const"() {value = dense<0.00477493973> : tensor} : () -> tensor -// CHECK-DAG: %[[w_b_zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[b0_scale:.*]] = "tf.Const"() {value = dense<1.85075514E-5> : tensor} : () -> tensor -// CHECK-DAG: %[[mid_scale:.*]] = "tf.Const"() {value = dense<0.00141507247> : tensor} : () -> tensor -// CHECK-DAG: %[[w1_scale:.*]] = "tf.Const"() {value = dense<0.00477652298> : tensor} : () -> tensor -// CHECK-DAG: %[[b1_scale:.*]] = "tf.Const"() {value = dense<6.75912588E-6> : tensor} : () -> tensor -// CHECK-DAG: %[[out_scale:.*]] = "tf.Const"() {value = dense<7.24974147E-4> : tensor} : () -> tensor -// CHECK-DAG: %[[quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[in_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantize_i8} -// CHECK-DAG: %[[conv0_dequantized:.*]] = "tf.PartitionedCall"(%[[quantized]], %[[w0_quantized]], %[[b0_quantized]], %[[in_scale]], %[[in_out_zp]], %[[w0_scale]], %[[w_b_zp]], %[[b0_scale]], %[[w_b_zp]], %[[mid_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_float_output_fn_1} -// CHECK-DAG: %[[conv0_quantized:.*]] = "tf.PartitionedCall"(%[[quantized]], %[[w0_quantized]], %[[b0_quantized]], %[[in_scale]], %[[in_out_zp]], %[[w0_scale]], %[[w_b_zp]], %[[b0_scale]], %[[w_b_zp]], %[[mid_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_fn_1} -// CHECK-DAG: %[[conv1_dequantized:.*]] = "tf.PartitionedCall"(%[[conv0_quantized]], %[[w1_quantized]], %[[b1_quantized]], %[[mid_scale]], %[[in_out_zp]], %[[w1_scale]], %[[w_b_zp]], %[[b1_scale]], %[[w_b_zp]], %[[out_scale]], %[[in_out_zp]]) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_float_output_fn_0} +// CHECK-LABEL: func @conv_with_dump +// CHECK-DAG: %[[w0_float:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.282878935, -0.211567819 +// CHECK-DAG: %[[b0_float:.*]] = "tf.Const"() <{value = dense<[-0.0192535277, -5.998660e-03]> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[w1_float:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}0.208403707, 0.478067577 +// CHECK-DAG: %[[b1_float:.*]] = "tf.Const"() <{value = dense<[-0.0291469581, 0.0106381178]> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[w0_quantized:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-59, -44 +// CHECK-DAG: %[[b0_quantized:.*]] = "tf.Const"() <{value = dense<[-1040, -324]> : tensor<2xi32>}> : () -> tensor<2xi32> +// 
CHECK-DAG: %[[w1_quantized:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}44, 100 +// CHECK-DAG: %[[b1_quantized:.*]] = "tf.Const"() <{value = dense<[-4312, 1574]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[in_scale:.*]] = "tf.Const"() <{value = dense<0.00387597573> : tensor}> : () -> tensor +// CHECK-DAG: %[[in_out_zp:.*]] = "tf.Const"() <{value = dense<-128> : tensor}> : () -> tensor +// CHECK-DAG: %[[w0_scale:.*]] = "tf.Const"() <{value = dense<0.00477493973> : tensor}> : () -> tensor +// CHECK-DAG: %[[w_b_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[b0_scale:.*]] = "tf.Const"() <{value = dense<1.85075514E-5> : tensor}> : () -> tensor +// CHECK-DAG: %[[mid_scale:.*]] = "tf.Const"() <{value = dense<0.00141507247> : tensor}> : () -> tensor +// CHECK-DAG: %[[w1_scale:.*]] = "tf.Const"() <{value = dense<0.00477652298> : tensor}> : () -> tensor +// CHECK-DAG: %[[b1_scale:.*]] = "tf.Const"() <{value = dense<6.75912588E-6> : tensor}> : () -> tensor +// CHECK-DAG: %[[out_scale:.*]] = "tf.Const"() <{value = dense<7.24974147E-4> : tensor}> : () -> tensor +// CHECK-DAG: %[[quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[in_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantize_i8}> +// CHECK-DAG: %[[conv0_dequantized:.*]] = "tf.PartitionedCall"(%[[quantized]], %[[w0_quantized]], %[[b0_quantized]], %[[in_scale]], %[[in_out_zp]], %[[w0_scale]], %[[w_b_zp]], %[[b0_scale]], %[[w_b_zp]], %[[mid_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_float_output_fn_1}> +// CHECK-DAG: %[[conv0_quantized:.*]] = "tf.PartitionedCall"(%[[quantized]], %[[w0_quantized]], %[[b0_quantized]], %[[in_scale]], %[[in_out_zp]], %[[w0_scale]], %[[w_b_zp]], %[[b0_scale]], %[[w_b_zp]], %[[mid_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_fn_1}> +// CHECK-DAG: %[[conv1_dequantized:.*]] = "tf.PartitionedCall"(%[[conv0_quantized]], %[[w1_quantized]], %[[b1_quantized]], %[[mid_scale]], %[[in_out_zp]], %[[w1_scale]], %[[w_b_zp]], %[[b1_scale]], %[[w_b_zp]], %[[out_scale]], %[[in_out_zp]]) <{config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu6_float_output_fn_0}> // CHECK-DAG: %[[identity:.*]] = "tf.Identity"(%[[conv1_dequantized]]) -// CHECK-DAG: %[[conv0_float:.*]] = "tf.PartitionedCall"(%arg0, %[[w0_float]], %[[b0_float]]) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_00} -// CHECK-DAG: %[[conv1_float:.*]] = "tf.PartitionedCall"(%[[conv0_dequantized]], %[[w1_float]], %[[b1_float]]) {config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_00} -// CHECK-DAG: "tf.DumpTensor"(%[[conv0_dequantized]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} -// CHECK-DAG: "tf.DumpTensor"(%[[conv0_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} -// CHECK-DAG: "tf.DumpTensor"(%[[conv1_dequantized]]) {device = "", enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = 
"/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} -// CHECK-DAG: "tf.DumpTensor"(%[[conv1_float]]) {device = "", enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} +// CHECK-DAG: %[[conv0_float:.*]] = "tf.PartitionedCall"(%arg0, %[[w0_float]], %[[b0_float]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_00}> {device = ""} +// CHECK-DAG: %[[conv1_float:.*]] = "tf.PartitionedCall"(%[[conv0_dequantized]], %[[w1_float]], %[[b1_float]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_00}> {device = ""} +// CHECK-DAG: "tf.DumpTensor"(%[[conv0_dequantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> {device = ""} +// CHECK-DAG: "tf.DumpTensor"(%[[conv0_float]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> {device = ""} +// CHECK-DAG: "tf.DumpTensor"(%[[conv1_dequantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> {device = ""} +// CHECK-DAG: "tf.DumpTensor"(%[[conv1_float]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv_with_dump", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> {device = ""} // CHECK-DAG: return %[[identity]] + +// PerChannel-LABEL: func @conv_with_dump +// PerChannel-DAG: %[[PerChannel_w0_float:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.282878935, -0.211567819 +// PerChannel-DAG: %[[b0_float:.*]] = "tf.Const"() <{value = dense<[-0.0192535277, -5.998660e-03]> : tensor<2xf32>}> : () -> tensor<2xf32> +// PerChannel-DAG: %[[w1_float:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}0.208403707, 0.478067577 +// PerChannel-DAG: %[[b1_float:.*]] = "tf.Const"() <{value = dense<[-0.0291469581, 0.0106381178]> : tensor<2xf32>}> : () -> tensor<2xf32> +// PerChannel-DAG: %[[w0_quantized:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-59, -77 +// PerChannel-DAG: %[[b0_quantized:.*]] = "tf.Const"() <{value = dense<[-1040, -561]> : tensor<2xi32>}> : () -> tensor<2xi32> +// PerChannel-DAG: %[[w1_quantized:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}45, 100 +// PerChannel-DAG: %[[b1_quantized:.*]] = "tf.Const"() <{value = dense<[-4411, 1574]> : tensor<2xi32>}> : () -> tensor<2xi32> +// PerChannel-DAG: %[[in_scale:.*]] = "tf.Const"() <{value = dense<0.00387597573> : tensor}> : () -> tensor +// PerChannel-DAG: %[[in_out_zp:.*]] = "tf.Const"() <{value = dense<-128> : tensor}> : () -> tensor +// PerChannel-DAG: %[[w0_scale:.*]] = "tf.Const"() <{value = dense<[0.00477493973, 0.00275693159]> : tensor<2xf32>}> : () -> tensor<2xf32> +// PerChannel-DAG: %[[w_b_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> : () -> tensor<2xi32> +// PerChannel-DAG: %[[b0_scale:.*]] = "tf.Const"() <{value = dense<[1.85075514E-5, 1.06858006E-5]> : tensor<2xf32>}> : () -> tensor<2xf32> +// PerChannel-DAG: %[[mid_scale:.*]] = "tf.Const"() <{value = dense<0.00141507247> : tensor}> : () -> tensor +// PerChannel-DAG: %[[w1_scale:.*]] = "tf.Const"() 
<{value = dense<[0.00467005931, 0.00477652298]> : tensor<2xf32>}> : () -> tensor<2xf32> +// PerChannel-DAG: %[[b1_scale:.*]] = "tf.Const"() <{value = dense<[6.60847217E-6, 6.75912588E-6]> : tensor<2xf32>}> : () -> tensor<2xf32> +// PerChannel-DAG: %[[out_scale:.*]] = "tf.Const"() <{value = dense<7.24974147E-4> : tensor}> : () -> tensor +} + +// ----- + +module { + func.func @conv_with_per_channel_and_tensor_weight(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> { + %cst = "tf.Const"() {device = "", value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<[[[[-0.630731344, 0.277245104], [0.54962182, 0.927732646], [0.180364341, 1.90948534]], [[-0.764542698, -0.287541777], [-0.211145893, -1.59367061], [-0.708605706, 1.79999375]], [[-0.954062759, 0.197947085], [-0.614013135, -0.966769516], [0.612640202, -1.45540595]]], [[[-0.418223292, 0.234433219], [5.057390e-01, 1.86747122], [0.899269938, 0.145780042]], [[0.335351914, 1.02572429], [0.084816426, 1.79729116], [-0.664676845, 0.310017586]], [[-0.795477629, -7.709830e-01], [0.581315517, 0.740075528], [0.921566545, 1.85318887]]]]> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[4.6128589E-5, 0.999998927]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %1 = "tf.PartitionedCall"(%0, %cst_0, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", device = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[3.50919247, 6.000000e+00]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + %3 = "tf.Identity"(%2) {device = ""} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + %4 = "quantfork.stats"(%3) {layerStats = dense<[3.50919247, 6.000000e+00]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + func.return %4 : tensor<1x3x4x2xf32> + } + func.func private @composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "composite_conv2d_with_bias_and_relu6_fn_1", tf.tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor<1x3x4x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + func.return %2 : tensor<1x3x4x2xf32> + } + +// CHECK-LABEL: func @conv_with_per_channel_and_tensor_weight +// CHECK-DAG: %[[b0_quantized:.*]] = "tf.Const"() <{value = dense<[120654, 119646]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[w0_quantized:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-42, 18 +// CHECK-DAG: %[[in_scale:.*]] = "tf.Const"() <{value = dense<0.0039215642> : tensor}> : () -> tensor +// CHECK-DAG: %[[in_out_zp:.*]] = "tf.Const"() <{value = dense<-128> : tensor}> : () -> tensor +// CHECK-DAG: %[[w0_scale:.*]] = "tf.Const"() <{value = dense<0.0150353173> : tensor}> : () -> tensor +// CHECK-DAG: %[[w_b_zp:.*]] = "tf.Const"() 
<{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[b0_scale:.*]] = "tf.Const"() <{value = dense<5.89619667E-5> : tensor}> : () -> tensor +// CHECK-DAG: %[[mid_scale:.*]] = "tf.Const"() <{value = dense<0.0235294122> : tensor}> : () -> tensor + +// PerChannel-LABEL: func @conv_with_per_channel_and_tensor_weight +// PerChannel-DAG: %[[b0_quantized:.*]] = "tf.Const"() <{value = dense<[241481, 119646]> : tensor<2xi32>}> : () -> tensor<2xi32> +// PerChannel-DAG: %[[w0_quantized:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-84, 18 +// PerChannel-DAG: %[[in_scale:.*]] = "tf.Const"() <{value = dense<0.0039215642> : tensor}> : () -> tensor +// PerChannel-DAG: %[[in_out_zp:.*]] = "tf.Const"() <{value = dense<-128> : tensor}> : () -> tensor +// PerChannel-DAG: %[[w0_scale:.*]] = "tf.Const"() <{value = dense<[0.0075123054, 0.0150353173]> : tensor<2xf32>}> : () -> tensor<2xf32> +// PerChannel-DAG: %[[w_b_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> : () -> tensor<2xi32> +// PerChannel-DAG: %[[b0_scale:.*]] = "tf.Const"() <{value = dense<[2.94599886E-5, 5.89619667E-5]> : tensor<2xf32>}> : () -> tensor<2xf32> +// PerChannel-DAG: %[[mid_scale:.*]] = "tf.Const"() <{value = dense<0.0235294122> : tensor}> : () -> tensor } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir index c500b3c72e8c86..e3bda3f5d09af9 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir @@ -15,7 +15,7 @@ module { // CHECK: %[[cst:.*]] = "arith.constant"() <{value = dense<0.000000e+00> : tensor<2x1024xf32>}> : () -> tensor<2x1024xf32> // CHECK: %[[q_cst:.*]] = "quantfork.qcast"(%[[cst]]) : (tensor<2x1024xf32>) -> tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>> -// CHECK: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_cst]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>>) -> tensor<*xf32> +// CHECK: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_cst]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x2x2x3xf32>, tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>>) -> tensor<*xf32> // CHECK: "func.return"(%[[out]]) : (tensor<*xf32>) -> () } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_weights.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_weights.mlir index c41d43d0f662fb..7f7a5090439e28 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_weights.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_weights.mlir @@ -8,7 +8,7 @@ module { } // CHECK-LABEL: func @not_quantize_const -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x1024xf32> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x1024xf32> // CHECK: return %[[W]] : tensor<2x1024xf32> } @@ -22,15 +22,15 @@ module { } // CHECK-LABEL: func @matmul -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<2x1024xi8> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> -// CHECK: 
%[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<2x1024xi8>) -> tensor<2x1024xf32> -// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[DEQUANTIZED]]) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[DEQUANTIZED]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %[[MATMUL]] : tensor<*xf32> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0157480314> : tensor -// CHECK: %[[CASTED_W:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0157480314> : tensor +// CHECK: %[[CASTED_W:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<*xi8>) -> tensor<*xf32> // CHECK: %[[DEQUANTIZED:.*]] = "tf.Mul"(%[[CASTED_W]], %[[SCALE]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> // CHECK: return %[[DEQUANTIZED]] : tensor<*xf32> } @@ -48,7 +48,7 @@ module { // CHECK-LABEL: func @not_quantize_matmul_without_const // CHECK: %[[ORIGINAL_IDENTITY_1:.*]] = "tf.Identity"(%arg0) {device = ""} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> // CHECK: %[[ORIGINAL_IDENTITY_2:.*]] = "tf.Identity"(%arg1) {device = ""} : (tensor<2x1024xf32>) -> tensor<2x1024xf32> -// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%[[ORIGINAL_IDENTITY_1]], %[[ORIGINAL_IDENTITY_2]]) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%[[ORIGINAL_IDENTITY_1]], %[[ORIGINAL_IDENTITY_2]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %[[MATMUL]] : tensor<*xf32> } @@ -63,14 +63,14 @@ module { } // CHECK-LABEL: func @quantize_xladotv2_bf16 -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<2x1024xi8> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[IDENTITY]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<2x1024xi8>) -> tensor<2x1024xbf16> -// CHECK: %[[MATMUL:.*]] = "tf.XlaDotV2"(%arg0, %[[DEQUANTIZED]]) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor<1x2x2x2xbf16>, tensor<2x1024xbf16>) -> tensor<1x2x2x1024xbf16> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[IDENTITY]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xbf16> +// CHECK: %[[MATMUL:.*]] = "tf.XlaDotV2"(%arg0, %[[DEQUANTIZED]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor<1x2x2x2xbf16>, tensor<2x1024xbf16>) 
-> tensor<1x2x2x1024xbf16> // CHECK: return %[[MATMUL]] : tensor<1x2x2x1024xbf16> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xbf16> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<1.574710e-02> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<1.574710e-02> : tensor } // ----- @@ -87,17 +87,17 @@ module { } // CHECK-LABEL: func @matmul_with_identity_and_reshape -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<1024x2xi8> -// CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[2, 1024]> : tensor<2xi32> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<1024x2xi8> +// CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 1024]> : tensor<2xi32> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<1024x2xi8>) -> tensor<1024x2xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<1024x2xi8>) -> tensor<1024x2xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1024x2xi8>) -> tensor<1024x2xf32> // CHECK: %[[ORIGINAL_IDENTITY:.*]] = "tf.Identity"(%[[DEQUANTIZED]]) {device = ""} : (tensor<1024x2xf32>) -> tensor<1024x2xf32> // CHECK: %[[RESHAPED_W:.*]] = "tf.Reshape"(%[[ORIGINAL_IDENTITY]], %[[SHAPE]]) : (tensor<1024x2xf32>, tensor<2xi32>) -> tensor<2x1024xf32> -// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[RESHAPED_W]]) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[RESHAPED_W]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %[[MATMUL]] : tensor<*xf32> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0157480314> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0157480314> : tensor } // ----- @@ -113,16 +113,16 @@ module { } // CHECK-LABEL: func @conv2d -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<2x3x3x512xi8> -// CHECK-DAG: %[[BIAS:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x3x3x512xi8> +// CHECK-DAG: %[[BIAS:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xf32> -// CHECK: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[DEQUANTIZED:.*]]) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> -// CHECK: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[BIAS]]) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> 
tensor<*xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xf32> +// CHECK: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[DEQUANTIZED:.*]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> +// CHECK: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[BIAS]]) <{data_format = "NHWC"}> {device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> // CHECK: return %[[BIASADD]] : tensor<*xf32> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0236220472> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0236220472> : tensor } // ----- @@ -138,14 +138,14 @@ module { } // CHECK-LABEL: func @depthwise_conv -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<2x3x3x512xi8> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x3x3x512xi8> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xf32> -// CHECK: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[DEQUANTIZED]]) {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]} : (tensor<1x3x4x512xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xf32> +// CHECK: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[DEQUANTIZED]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]}> {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", device = ""} : (tensor<1x3x4x512xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> // CHECK: return %[[DEPTHWISE_CONV2D]] : tensor<*xf32> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.00787401571> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00787401571> : tensor } // ----- @@ -160,16 +160,16 @@ module { } // CHECK-LABEL: func @quantize_sharded_weights_with_xladot -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<512x512xi8> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<512x512xi8> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<512x512xi8>) -> tensor<512x512xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<512x512xi8>) -> tensor<512x512xf32> -// CHECK: %[[SHARDED_W:.*]] = "tf.XlaSharding"(%[[DEQUANTIZED]]) {_XlaSharding = 
"\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", device = "", sharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", unspecified_dims = []} : (tensor<512x512xf32>) -> tensor<512x512xf32> -// CHECK: %[[XLADOT:.*]] = "tf.XlaDotV2"(%arg0, %[[SHARDED_W]]) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor, tensor<512x512xf32>) -> tensor -// CHECK: %[[ORIGINAL_CAST:.*]] = "tf.Cast"(%[[XLADOT]]) {Truncate = false} : (tensor) -> tensor +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<512x512xi8>) -> tensor<512x512xf32> +// CHECK: %[[SHARDED_W:.*]] = "tf.XlaSharding"(%[[DEQUANTIZED]]) <{_XlaSharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", sharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01"}> {device = "", unspecified_dims = []} : (tensor<512x512xf32>) -> tensor<512x512xf32> +// CHECK: %[[XLADOT:.*]] = "tf.XlaDotV2"(%arg0, %[[SHARDED_W]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor, tensor<512x512xf32>) -> tensor +// CHECK: %[[ORIGINAL_CAST:.*]] = "tf.Cast"(%[[XLADOT]]) <{Truncate = false}> : (tensor) -> tensor // CHECK: return %[[ORIGINAL_CAST]] : tensor // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0787401571> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0787401571> : tensor } // ----- @@ -184,16 +184,16 @@ module { } // CHECK-LABEL: func @quantize_sharded_weights_with_xladot_with_identity -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<512x512xi8> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<512x512xi8> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<512x512xi8>) -> tensor<512x512xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<512x512xi8>) -> tensor<512x512xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<512x512xi8>) -> tensor<512x512xf32> // CHECK: %[[IDENTITY_W:.*]] = "tf.Identity"(%[[DEQUANTIZED]]) {device = ""} : (tensor<512x512xf32>) -> tensor<512x512xf32> -// CHECK: %[[SHARDED_W:.*]] = "tf.XlaSharding"(%[[IDENTITY_W]]) {_XlaSharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", device = "", sharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", unspecified_dims = []} : (tensor<512x512xf32>) -> tensor<512x512xf32> -// CHECK: %[[XLADOT:.*]] = "tf.XlaDotV2"(%arg0, %[[SHARDED_W]]) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor, tensor<512x512xf32>) -> tensor +// CHECK: %[[SHARDED_W:.*]] = "tf.XlaSharding"(%[[IDENTITY_W]]) <{_XlaSharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", sharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01"}> {device = "", unspecified_dims = []} : (tensor<512x512xf32>) -> tensor<512x512xf32> +// CHECK: %[[XLADOT:.*]] = "tf.XlaDotV2"(%arg0, %[[SHARDED_W]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor, tensor<512x512xf32>) -> tensor // CHECK: return %[[XLADOT]] : tensor // CHECK-LABEL: func.func private 
@composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0787401571> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0787401571> : tensor } // ----- @@ -208,16 +208,16 @@ module { } // CHECK-LABEL: func @quantize_xlagather -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<200x100x300xi8>} : () -> tensor<200x100x300xi8> -// CHECK-DAG: %[[IDX:.*]] = "tf.Const"() {value = dense<[1, 1, 300]> : tensor<3xi64> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<200x100x300xi8>}> : () -> tensor<200x100x300xi8> +// CHECK-DAG: %[[IDX:.*]] = "tf.Const"() <{value = dense<[1, 1, 300]> : tensor<3xi64> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<200x100x300xi8>) -> tensor<200x100x300xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<200x100x300xi8>) -> tensor<200x100x300xf32> -// CHECK: %[[GATHER:.*]] = "tf.XlaGather"(%[[DEQUANTIZED]], %arg0, %[[IDX]]) {dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<200x100x300xi8>) -> tensor<200x100x300xf32> +// CHECK: %[[GATHER:.*]] = "tf.XlaGather"(%[[DEQUANTIZED]], %arg0, %[[IDX]]) <{dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true}> : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xf32> // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[GATHER]]) {device = ""} : (tensor<1x300x10xf32>) -> tensor<1x300x10xf32> // CHECK: return %[[IDENTITY]] : tensor<1x300x10xf32> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0787401571> : tensor} : () -> tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0787401571> : tensor}> : () -> tensor } // ----- @@ -236,17 +236,17 @@ module { } // CHECK-LABEL: func @partitioned_call -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<2x1024xi8> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<2x1024xi8>) -> tensor<2x1024xf32> -// CHECK: %[[OUTPUT:.*]] = "tf.PartitionedCall"(%arg0, %[[DEQUANTIZED]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[OUTPUT:.*]] = "tf.PartitionedCall"(%arg0, %[[DEQUANTIZED]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %[[OUTPUT]] : tensor<*xf32> // 
CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0314960629> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0314960629> : tensor // CHECK-LABEL: func private @composite_matmul_fn -// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %[[MATMUL]] : tensor<*xf32> } @@ -272,21 +272,21 @@ module { } // CHECK-LABEL: func @recursive_partitioned_call(%arg0: tensor<1x2x2x3xf32>) -> tensor<*xf32> -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<2x1024xi8> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<2x1024xi8>) -> tensor<2x1024xf32> -// CHECK: %[[OUTPUT:.*]] = "tf.PartitionedCall"(%arg0, %[[DEQUANTIZED]]) {config = "", config_proto = "", executor_type = "", f = @outer_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[OUTPUT:.*]] = "tf.PartitionedCall"(%arg0, %[[DEQUANTIZED]]) <{config = "", config_proto = "", executor_type = "", f = @outer_fn}> : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %[[OUTPUT]] : tensor<*xf32> // CHECK-LABEL: func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0314960629> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0314960629> : tensor // CHECK-LABEL: func private @outer_fn -// CHECK: %[[OUTER_OUTPUT:.*]] = "tf.PartitionedCall"(%arg0, %arg1) {config = "", config_proto = "", executor_type = "", f = @inner_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[OUTER_OUTPUT:.*]] = "tf.PartitionedCall"(%arg0, %arg1) <{config = "", config_proto = "", executor_type = "", f = @inner_fn}> : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %[[OUTER_OUTPUT]] : tensor<*xf32> // CHECK-LABEL: func private @inner_fn -// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %[[MATMUL]] : tensor<*xf32> // ----- @@ -302,17 +302,17 @@ module { } // CHECK-LABEL: func @matmul_multiuses -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<2x1024xi8> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> // CHECK: 
%[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<2x1024xi8>) -> tensor<2x1024xf32> -// CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[DEQUANTIZED]]) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> -// CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg1, %[[DEQUANTIZED]]) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[DEQUANTIZED]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg1, %[[DEQUANTIZED]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: %[[ORIGINAL_IDENTITY:.*]] = "tf.Identity"(%[[DEQUANTIZED]]) {device = ""} : (tensor<2x1024xf32>) -> tensor<2x1024xf32> -// CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[ORIGINAL_IDENTITY]]) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[ORIGINAL_IDENTITY]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: return %[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]] : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0157480314> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0157480314> : tensor } // ----- @@ -327,15 +327,15 @@ module { } // CHECK-LABEL: func @matmul_multiuses -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<2x1024xi8> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<2x1024xi8>) -> tensor<2x1024xf32> -// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[DEQUANTIZED]]) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[DEQUANTIZED]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, 
tensor<2x1024xf32>) -> tensor<*xf32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%arg1, %[[DEQUANTIZED]]) {device = ""} : (tensor<2x1024xf32>, tensor<2x1024xf32>) -> tensor<2x1024xf32> // CHECK: return %[[MATMUL]], %[[ADD]] : tensor<*xf32>, tensor<2x1024xf32> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.0157480314> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0157480314> : tensor } // ----- @@ -375,21 +375,21 @@ module { } // CHECK-LABEL: func @matmul_with_while -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<1024x1024xi8> -// CHECK-DAG: %[[CNT:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<1024x1024xi8> +// CHECK-DAG: %[[CNT:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<1024x1024xi8>) -> tensor<1024x1024xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<1024x1024xi8>) -> tensor<1024x1024xf32> -// CHECK: %[[WHILE:.*]] = "tf.While"(%[[CNT]], %[[CNT]], %[[CNT]], %arg0, %[[DEQUANTIZED]]) {T = [i32, i32, i32, f32, f32], _lower_using_switch_merge = true, _num_original_outputs = 5 : i64, _read_only_resource_inputs = [], body = @while_body, cond = @while_cond, device = "", is_stateless = true, output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x1024>, #tf_type.shape<1024x1024>], parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) -> (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1024x1024xi8>) -> tensor<1024x1024xf32> +// CHECK: %[[WHILE:.*]] = "tf.While"(%[[CNT]], %[[CNT]], %[[CNT]], %arg0, %[[DEQUANTIZED]]) <{body = @while_body, cond = @while_cond, is_stateless = true, parallel_iterations = 10 : i64, shape_invariant}> {T = [i32, i32, i32, f32, f32], _lower_using_switch_merge = true, _num_original_outputs = 5 : i64, _read_only_resource_inputs = [], device = "", output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x1024>, #tf_type.shape<1024x1024>]} : (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) -> (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[WHILE:.*]]) {device = ""} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> // CHECK: return %[[IDENTITY]] : tensor<1x1024xf32> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.00787401571> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00787401571> : tensor // CHECK-LABEL: func private @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xf32>, %arg4: tensor<1024x1024xf32>) -> (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) -// CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg3, %arg4) {device = "", transpose_a = false, transpose_b = false} : (tensor<1x1024xf32>, tensor<1024x1024xf32>) -> tensor<1x1024xf32> +// CHECK: %[[MATMUL_1:.*]] = 
"tf.MatMul"(%arg3, %arg4) <{transpose_a = false, transpose_b = false}> {device = ""} : (tensor<1x1024xf32>, tensor<1024x1024xf32>) -> tensor<1x1024xf32> // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%arg4) {device = ""} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> -// CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg3, %[[IDENTITY]]) {device = "", transpose_a = false, transpose_b = false} : (tensor<1x1024xf32>, tensor<1024x1024xf32>) -> tensor<1x1024xf32> +// CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg3, %[[IDENTITY]]) <{transpose_a = false, transpose_b = false}> {device = ""} : (tensor<1x1024xf32>, tensor<1024x1024xf32>) -> tensor<1x1024xf32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[MATMUL_1]], %[[MATMUL_2]]) {device = ""} : (tensor<1x1024xf32>, tensor<1x1024xf32>) -> tensor<1x1024xf32> // CHECK-LABEL: func private @while_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xf32>, %arg4: tensor<1024x1024xf32>) -> tensor @@ -401,13 +401,13 @@ module { func.func @matmul_with_while_bf16(%arg0: tensor<1x1024xbf16>) -> tensor<1x1024xbf16> { %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %cst_0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %cst_1 = "tf.Const"(){value = dense<1.0> : tensor<1024x1024xbf16>} : () -> tensor<1024x1024xbf16> + %cst_1 = "tf.Const"() {value = dense<1.0> : tensor<1024x1024xbf16>} : () -> tensor<1024x1024xbf16> %0:5 = "tf.While"(%cst_0, %cst, %cst_0, %arg0, %cst_1) {T = [i32, i32, i32, bf16, bf16],_lower_using_switch_merge = true, _num_original_outputs = 5 : i64, _read_only_resource_inputs = [], body = @while_body, cond = @while_cond, device = "", is_stateless = true, output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x1024>, #tf_type.shape<1024x1024>], parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) %1 = "tf.Identity"(%0#3) {device = ""} : (tensor<1x1024xbf16>) -> tensor<1x1024xbf16> func.return %1 : tensor<1x1024xbf16> } - func.func private @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xbf16>, %arg4: tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) + func.func private @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xbf16>, %arg4: tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) { %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %0 = "tf.AddV2"(%arg2, %cst) {device = ""} : (tensor, tensor) -> tensor @@ -432,20 +432,20 @@ module { } // CHECK-LABEL: func @matmul_with_while_bf16 -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<127> : tensor<1024x1024xi8> -// CHECK-DAG: %[[CNT:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<1024x1024xi8> +// CHECK-DAG: %[[CNT:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[W]]) : (tensor<1024x1024xi8>) -> tensor<1024x1024xi8> -// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[IDENTITY]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<1024x1024xi8>) -> tensor<1024x1024xbf16> -// CHECK: %[[WHILE:.*]] = "tf.While"(%[[CNT]], %[[CNT]], %[[CNT]], %arg0, %[[DEQUANTIZED]]) {T = [i32, i32, i32, bf16, bf16], _lower_using_switch_merge = true, 
_num_original_outputs = 5 : i64, _read_only_resource_inputs = [], body = @while_body, cond = @while_cond, device = "", is_stateless = true, output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x1024>, #tf_type.shape<1024x1024>], parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[IDENTITY]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1024x1024xi8>) -> tensor<1024x1024xbf16> +// CHECK: %[[WHILE:.*]] = "tf.While"(%[[CNT]], %[[CNT]], %[[CNT]], %arg0, %[[DEQUANTIZED]]) <{body = @while_body, cond = @while_cond, is_stateless = true, parallel_iterations = 10 : i64, shape_invariant}> {T = [i32, i32, i32, bf16, bf16], _lower_using_switch_merge = true, _num_original_outputs = 5 : i64, _read_only_resource_inputs = [], device = "", output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x1024>, #tf_type.shape<1024x1024>]} : (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) // CHECK: %[[ORIGIANL_IDENTITY:.*]] = "tf.Identity"(%[[WHILE:.*]]) {device = ""} : (tensor<1x1024xbf16>) -> tensor<1x1024xbf16> // CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xbf16> -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<7.873530e-03> : tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<7.873530e-03> : tensor // CHECK-LABEL: func private @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xbf16>, %arg4: tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) { -// CHECK: %[[MATMUL_1:.*]] = "tf.XlaDotV2"(%arg3, %arg4) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<1x1024xbf16> +// CHECK: %[[MATMUL_1:.*]] = "tf.XlaDotV2"(%arg3, %arg4) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<1x1024xbf16> // CHECK: %[[IDENTITY_2:.*]] = "tf.Identity"(%arg4) {device = ""} : (tensor<1024x1024xbf16>) -> tensor<1024x1024xbf16> -// CHECK: %[[MATMUL_2:.*]] = "tf.XlaDotV2"(%arg3, %[[IDENTITY_2]]) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<1x1024xbf16> +// CHECK: %[[MATMUL_2:.*]] = "tf.XlaDotV2"(%arg3, %[[IDENTITY_2]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<1x1024xbf16> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[MATMUL_1]], %[[MATMUL_2]]) {device = ""} : (tensor<1x1024xbf16>, tensor<1x1024xbf16>) -> tensor<1x1024xbf16> // CHECK-LABEL: func private @while_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xbf16>, %arg4: tensor<1024x1024xbf16>) -> tensor { @@ -482,7 +482,7 @@ module { } // CHECK-LABEL: func @matmul_with_while_returning_mutated_value -// CHECK-DAG: %[[W:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<1024x1024xf32>} : () -> tensor<1024x1024xf32> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<1024x1024xf32>}> : () -> tensor<1024x1024xf32> // ----- module { @@ 
-499,27 +499,27 @@ module { } // CHECK-LABEL: func @multiple_quantizable_ops_in_graph -// CHECK-DAG: %[[W_1:.*]] = "tf.Const"() {value = dense<127> : tensor<2x3x3x1024xi8>} : () -> tensor<2x3x3x1024xi8> -// CHECK-DAG: %[[W_2:.*]] = "tf.Const"() {value = dense<127> : tensor<3x3x1024x1xi8>} : () -> tensor<3x3x1024x1xi8> -// CHECK-DAG: %[[W_3:.*]] = "tf.Const"() {value = dense<127> : tensor<1024x3x4x3xi8>} : () -> tensor<1024x3x4x3xi8> -// CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %[[W_1:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x3x3x1024xi8>}> : () -> tensor<2x3x3x1024xi8> +// CHECK-DAG: %[[W_2:.*]] = "tf.Const"() <{value = dense<127> : tensor<3x3x1024x1xi8>}> : () -> tensor<3x3x1024x1xi8> +// CHECK-DAG: %[[W_3:.*]] = "tf.Const"() <{value = dense<127> : tensor<1024x3x4x3xi8>}> : () -> tensor<1024x3x4x3xi8> +// CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> {device = ""} : () -> tensor // CHECK: %[[IDENTITY_1:.*]] = "tf.Identity"(%[[W_1]]) : (tensor<2x3x3x1024xi8>) -> tensor<2x3x3x1024xi8> -// CHECK: %[[DEQUANTIZED_1:.*]] = "tf.PartitionedCall"(%[[IDENTITY_1]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform__} : (tensor<2x3x3x1024xi8>) -> tensor<2x3x3x1024xf32> +// CHECK: %[[DEQUANTIZED_1:.*]] = "tf.PartitionedCall"(%[[IDENTITY_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform__}> : (tensor<2x3x3x1024xi8>) -> tensor<2x3x3x1024xf32> // CHECK: %[[IDENTITY_2:.*]] = "tf.Identity"(%[[W_2]]) : (tensor<3x3x1024x1xi8>) -> tensor<3x3x1024x1xi8> -// CHECK: %[[DEQUANTIZED_2:.*]] = "tf.PartitionedCall"(%[[IDENTITY_2]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform_} : (tensor<3x3x1024x1xi8>) -> tensor<3x3x1024x1xf32> +// CHECK: %[[DEQUANTIZED_2:.*]] = "tf.PartitionedCall"(%[[IDENTITY_2]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform_}> : (tensor<3x3x1024x1xi8>) -> tensor<3x3x1024x1xf32> // CHECK: %[[IDENTITY_3:.*]] = "tf.Identity"(%[[W_3]]) : (tensor<1024x3x4x3xi8>) -> tensor<1024x3x4x3xi8> -// CHECK: %[[DEQUANTIZED_3:.*]] = "tf.PartitionedCall"(%[[IDENTITY_3]]) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<1024x3x4x3xi8>) -> tensor<1024x3x4x3xf32> -// CHECK: %[[GATHER:.*]] = "tf.GatherV2"(%[[DEQUANTIZED_3]], %arg0, %[[AXIS]]) {batch_dims = 0 : i64, device = ""} : (tensor<1024x3x4x3xf32>, tensor<1xi32>, tensor) -> tensor<1x3x4x3xf32> -// CHECK: %[[CONV_1:.*]] = "tf.Conv2D"(%[[GATHER]], %[[DEQUANTIZED_1]]) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1024xf32>) -> tensor<1x3x2x1024xf32> -// CHECK: %[[CONV_2:.*]] = "tf.Conv2D"(%[[CONV_1]], %[[DEQUANTIZED_2]]) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x2x1024xf32>, tensor<3x3x1024x1xf32>) -> tensor<1x3x1x1xf32> +// CHECK: %[[DEQUANTIZED_3:.*]] = "tf.PartitionedCall"(%[[IDENTITY_3]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1024x3x4x3xi8>) -> tensor<1024x3x4x3xf32> +// CHECK: %[[GATHER:.*]] = "tf.GatherV2"(%[[DEQUANTIZED_3]], %arg0, %[[AXIS]]) <{batch_dims = 0 : i64}> {device = ""} : (tensor<1024x3x4x3xf32>, tensor<1xi32>, 
tensor) -> tensor<1x3x4x3xf32> +// CHECK: %[[CONV_1:.*]] = "tf.Conv2D"(%[[GATHER]], %[[DEQUANTIZED_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> {device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1024xf32>) -> tensor<1x3x2x1024xf32> +// CHECK: %[[CONV_2:.*]] = "tf.Conv2D"(%[[CONV_1]], %[[DEQUANTIZED_2]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> {device = ""} : (tensor<1x3x2x1024xf32>, tensor<3x3x1024x1xf32>) -> tensor<1x3x1x1xf32> // CHECK-LABEL: func private @composite_dequantize_uniform__ -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.00866141729> : tensor} : () -> tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00866141729> : tensor}> : () -> tensor // CHECK-LABEL: func private @composite_dequantize_uniform_ -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.00866141729> : tensor} : () -> tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00866141729> : tensor}> : () -> tensor // CHECK-LABEL: func private @composite_dequantize_uniform -// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<0.00866141729> : tensor} : () -> tensor +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00866141729> : tensor}> : () -> tensor -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir index 4356d084a56845..f24b6399774f08 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir @@ -23,7 +23,7 @@ func.func private @conv(%input: tensor<1x3x4x3xf32> {tf._user_specified_name = " // CHECK-DAG: [[weight:%.+]] = "arith.constant"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2x!quant.uniform> // CHECK: [[q_input:%.+]] = "quantfork.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> // CHECK-NEXT: [[q_bias:%.+]] = "quantfork.qcast"([[bias]]) : (tensor<2xf32>) -> tensor<2x!quant.uniform> -// CHECK-NEXT: [[conv:%.+]] = "tf.PartitionedCall"([[q_input]], [[weight]], [[q_bias]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @[[composite_fn:composite_conv2d_with_bias_and_relu6_fn.*]]} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> +// CHECK-NEXT: [[conv:%.+]] = "tf.PartitionedCall"([[q_input]], [[weight]], [[q_bias]]) <{config = "", config_proto = "", executor_type = "", f = @[[composite_fn:composite_conv2d_with_bias_and_relu6_fn.*]]}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> // CHECK-NEXT: [[res:%.+]] = "quantfork.dcast"([[conv]]) : (tensor<*x!quant.uniform>) -> tensor<*xf32> // CHECK-NEXT: "func.return"([[res]]) : (tensor<*xf32>) -> () @@ -127,10 +127,10 @@ func.func private @avgpool_after_conv(%input: tensor<1x3x4x3xf32> {tf._user_spec // CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1 // CHECK-SAME: (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> // CHECK: %[[scast:.*]] = "quantfork.scast"(%[[conv]] -// CHECK: 
%[[fcast:.*]] = "tf.Cast"(%[[scast]]) {Truncate = false} : (tensor<*xi8>) -> tensor<*xf32> +// CHECK: %[[fcast:.*]] = "tf.Cast"(%[[scast]]) <{Truncate = false}> : (tensor<*xi8>) -> tensor<*xf32> // CHECK: %[[avgpool_f32:.*]] = "tf.AvgPool"(%[[fcast]]) // CHECK-SAME: (tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[round:.*]] = "tf.Round"(%[[avgpool_f32]]) -// CHECK: %[[icast:.*]] = "tf.Cast"(%[[round]]) {Truncate = false} : (tensor<*xf32>) -> tensor<*xi8> +// CHECK: %[[icast:.*]] = "tf.Cast"(%[[round]]) <{Truncate = false}> : (tensor<*xf32>) -> tensor<*xi8> // CHECK: %[[reshape:.*]] = "tf.Reshape"(%[[icast]] // CHECK: %[[sc2:.*]] = "quantfork.scast"(%[[reshape]]) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/remove_var_init_by_const.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/remove_var_init_by_const.mlir index d5e18209291871..da78ef781b3468 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/remove_var_init_by_const.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/remove_var_init_by_const.mlir @@ -79,7 +79,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-NOT: "tf.AssignVariableOp" // CHECK: %[[CST:.*]] = "tf.Const"() // CHECK-NEXT: %[[IDENTITY:.*]] = "tf.Identity"(%[[CST]]) - // CHECK-NEXT: %[[VAR:.*]] = "tf.VarHandleOp"() {{{.*shared_name = "var_1".*}}} + // CHECK-NEXT: %[[VAR:.*]] = "tf.VarHandleOp"() <{{{.*shared_name = "var_1".*}}}> // CHECK-NEXT: "tf.AssignVariableOp"(%[[VAR]], %[[IDENTITY]]) } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/replace_cast_hacks_with_tf_xla_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/replace_cast_hacks_with_tf_xla_ops.mlir index c7fae4cf232b9f..04677b6e468828 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/replace_cast_hacks_with_tf_xla_ops.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/replace_cast_hacks_with_tf_xla_ops.mlir @@ -57,17 +57,17 @@ module attributes {} { } // CHECK-LABEL: func @conv_with_bias_and_relu -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK-DAG: %[[CONST_3:.*]] = "tf.Const"() {value = dense<0> : tensor<2x2xi32>} : () -> tensor<2x2xi32> -// CHECK-DAG: %[[CONST_4:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<4x2xi32>} : () -> tensor<4x2xi32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<1> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_3:.*]] = "tf.Const"() <{value = dense<0> : tensor<2x2xi32>}> : () -> tensor<2x2xi32> +// CHECK-DAG: %[[CONST_4:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<4x2xi32>}> : () -> tensor<4x2xi32> // CHECK-DAG-SAME{LITERAL}: value = dense<[[0, 0], [0, 1], [0, 1], [0, 0]]> -// CHECK-DAG: %[[CONST_5:.*]] = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor -// CHECK-DAG: %[[CONST_6:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> -// CHECK-DAG: %[[CONST_7:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2xi32> +// CHECK-DAG: %[[CONST_5:.*]] = "tf.Const"() 
<{value = dense<-128> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_6:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2xi8> +// CHECK-DAG: %[[CONST_7:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2xi32> // CHECK-DAG-SAME{LITERAL}: value = dense<[[[[-22016, -23680]]]]> -// CHECK-DAG: %[[CONST_8:.*]] = "tf.Const"() {value = dense<[162, 160]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CONST_8:.*]] = "tf.Const"() <{value = dense<[162, 160]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[PADV2_0:.*]] = "tf.PadV2"({{.*}}, %[[CONST_4]], %[[CONST_5]]) : (tensor<1x3x4x3xi8>, tensor<4x2xi32>, tensor) -> tensor<1x4x5x3xi8> // CHECK: %[[XLACONVV2_0:.*]] = "tf.XlaConvV2"(%[[PADV2_0]], %[[CONST_6]], %[[CONST_0]], %[[CONST_3]], %[[CONST_1]], %[[CONST_1]], %[[CONST_2]]) // CHECK-SAME: (tensor<1x4x5x3xi8>, tensor<2x3x3x2xi8>, tensor<2xi32>, tensor<2x2xi32>, tensor<2xi32>, tensor<2xi32>, tensor) -> tensor<1x3x2x2xi32> @@ -144,16 +144,16 @@ module attributes {} { } // CHECK-LABEL: func @depthwise_conv_with_bias_and_relu6 -// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<4x2xi32>} : () -> tensor<4x2xi32> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor -// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<2x3x1x3xi8>} : () -> tensor<2x3x1x3xi8> -// CHECK-DAG: %[[CONST_3:.*]] = "tf.Const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CONST_4:.*]] = "tf.Const"() {value = dense<0> : tensor<2x2xi32>} : () -> tensor<2x2xi32> -// CHECK-DAG: %[[CONST_5:.*]] = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[CONST_6:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor -// CHECK-DAG: %[[CONST_7:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<1x1x1x3xi32>} : () -> tensor<1x1x1x3xi32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<4x2xi32>}> : () -> tensor<4x2xi32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<-128> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<2x3x1x3xi8>}> : () -> tensor<2x3x1x3xi8> +// CHECK-DAG: %[[CONST_3:.*]] = "tf.Const"() <{value = dense<2> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CONST_4:.*]] = "tf.Const"() <{value = dense<0> : tensor<2x2xi32>}> : () -> tensor<2x2xi32> +// CHECK-DAG: %[[CONST_5:.*]] = "tf.Const"() <{value = dense<1> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CONST_6:.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_7:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<1x1x1x3xi32>}> : () -> tensor<1x1x1x3xi32> // CHECK-DAG-SAME{LITERAL}: value = dense<[[[[55040, -15104, -21376]]]]> -// CHECK-DAG: %[[CONST_8:.*]] = "tf.Const"() {value = dense<[129, 166, 221]> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK-DAG: %[[CONST_8:.*]] = "tf.Const"() <{value = dense<[129, 166, 221]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: %[[PADV2_0:.*]] = "tf.PadV2"({{.*}}, %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3xi8>, tensor<4x2xi32>, tensor) -> tensor<1x4x5x3xi8> // CHECK: %[[XLACONVV2_0:.*]] = "tf.XlaConvV2"(%[[PADV2_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], %[[CONST_5]], %[[CONST_5]], %[[CONST_6]]) // CHECK-SAME: (tensor<1x4x5x3xi8>, tensor<2x3x1x3xi8>, tensor<2xi32>, tensor<2x2xi32>, tensor<2xi32>, tensor<2xi32>, tensor) 
-> tensor<1x2x2x3xi32> @@ -198,14 +198,14 @@ module attributes {} { } // CHECK-LABEL: func @dynamic_shaped_conv2d_with_bias_and_relu6_inlined -// CHECK-DAG: %[[filter:.*]] = "tf.Const"() {device = "", value = dense<2> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> +// CHECK-DAG: %[[filter:.*]] = "tf.Const"() <{value = dense<2> : tensor<2x3x3x2xi8>}> {device = ""} : () -> tensor<2x3x3x2xi8> // CHECK-DAG: %[[input_shape:.*]] = "tf.Shape"({{.*}}) : (tensor) -> tensor<4xi32> -// CHECK-DAG: %[[input_dim_1:.*]] = "tf.StridedSlice"(%[[input_shape]], {{.*}}, {{.*}}, {{.*}}) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor -// CHECK-DAG: %[[input_dim_2:.*]] = "tf.StridedSlice"(%[[input_shape]], {{.*}}, {{.*}}, {{.*}}) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK-DAG: %[[input_dim_1:.*]] = "tf.StridedSlice"(%[[input_shape]], {{.*}}, {{.*}}, {{.*}}) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK-DAG: %[[input_dim_2:.*]] = "tf.StridedSlice"(%[[input_shape]], {{.*}}, {{.*}}, {{.*}}) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK-DAG: %[[padding_rank_1:.*]] = "tf.Concat"({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<8xi32> // CHECK-DAG: %[[padding_rank_2:.*]] = "tf.Reshape"(%[[padding_rank_1]], {{.*}}) : (tensor<8xi32>, tensor<2xi64>) -> tensor<4x2xi32> // CHECK-DAG: %[[input_padded:.*]] = "tf.PadV2"(%{{.*}}, %[[padding_rank_2]], {{.*}}) : (tensor, tensor<4x2xi32>, tensor) -> tensor -// CHECK: %[[conv_output:.*]] = "tf.XlaConvV2"(%[[input_padded]], %[[filter]], {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) {batch_group_count = 1 : i64, dimension_numbers = "{{.*}}", precision_config = ""} : (tensor, tensor<2x3x3x2xi8>, tensor<2xi32>, tensor<2x2xi32>, tensor<2xi32>, tensor<2xi32>, tensor) -> tensor +// CHECK: %[[conv_output:.*]] = "tf.XlaConvV2"(%[[input_padded]], %[[filter]], {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) <{batch_group_count = 1 : i64, dimension_numbers = "{{.*}}", precision_config = ""}> : (tensor, tensor<2x3x3x2xi8>, tensor<2xi32>, tensor<2x2xi32>, tensor<2xi32>, tensor<2xi32>, tensor) -> tensor // CHECK: %[[conv_output_sub:.*]] = "tf.Sub"(%[[conv_output]], {{.*}}) : (tensor, tensor<1x1x1x2xi32>) -> tensor // CHECK: %[[conv_output_add:.*]] = "tf.AddV2"(%[[conv_output_sub]], {{.*}}) {device = ""} : (tensor, tensor<2xi32>) -> tensor } @@ -264,7 +264,7 @@ module attributes {tf_saved_model.semantics} { } // CHECK-LABEL: func @conv_with_filter_larger_than_1MB -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<-264192> : tensor<1x1x1x512xi32>} : () -> tensor<1x1x1x512xi32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<-264192> : tensor<1x1x1x512xi32>}> : () -> tensor<1x1x1x512xi32> // CHECK: %[[PADV2_0:.*]] = "tf.PadV2" // CHECK: %[[XLACONVV2_0:.*]] = "tf.XlaConvV2"(%[[PADV2_0]] // CHECK: %[[SUB_0:.*]] = 
"tf.Sub"(%[[XLACONVV2_0]], %[[CONST]]) @@ -297,8 +297,8 @@ module attributes {tf_saved_model.semantics} { return %12 : tensor<1x3xf32> } // CHECK-LABEL: func @matmul_with_relu -// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() {device = "", value = dense<1> : tensor<1024x3xi8>} : () -> tensor<1024x3xi8> -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<-131072> : tensor<1x3xi32>} : () -> tensor<1x3xi32> +// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() <{value = dense<1> : tensor<1024x3xi8>}> {device = ""} : () -> tensor<1024x3xi8> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<-131072> : tensor<1x3xi32>}> : () -> tensor<1x3xi32> // CHECK: %[[MATMUL:.*]] = "tf.XlaDotV2"({{.*}}, %[[WEIGHT]]) // CHECK-SAME: (tensor<1x1024xi8>, tensor<1024x3xi8>) -> tensor<1x3xi32> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[MATMUL]], %[[CONST]]) : (tensor<1x3xi32>, tensor<1x3xi32>) -> tensor<1x3xi32> @@ -479,11 +479,11 @@ module attributes {tf_saved_model.semantics} { } // CHECK-LABEL: func @conv3d_with_static_shape -// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() {device = "", value = dense<1> : tensor<2x3x3x3x2xi8>} : () -> tensor<2x3x3x3x2xi8> +// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() <{value = dense<1> : tensor<2x3x3x3x2xi8>}> {device = ""} : () -> tensor<2x3x3x3x2xi8> // CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {{.*}} : () -> tensor<5x2xi32> // CHECK-DAG-SAME{LITERAL}: value = dense<[[0, 0], [0, 1], [0, 1], [1, 1], [0, 0]]> : tensor<5x2xi32> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<-43> : tensor} : () -> tensor -// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() {value = dense<-2322> : tensor<1x1x1x1x2xi32>} : () -> tensor<1x1x1x1x2xi32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<-43> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<-2322> : tensor<1x1x1x1x2xi32>}> : () -> tensor<1x1x1x1x2xi32> // CHECK: %[[PAD:.*]] = "tf.PadV2"({{.*}}, %[[CONST]], %[[CONST_1]]) // CHECK: %[[CONV:.*]] = "tf.XlaConvV2"(%[[PAD]], %[[WEIGHT]] @@ -524,9 +524,9 @@ module attributes {tf_saved_model.semantics} { } // CHECK-LABEL: func @conv3d_with_dynamic_shape -// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() {device = "", value = dense<1> : tensor<2x3x3x3x2xi8>} : () -> tensor<2x3x3x3x2xi8> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<-43> : tensor} : () -> tensor -// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() {value = dense<-2322> : tensor<1x1x1x1x2xi32>} : () -> tensor<1x1x1x1x2xi32> +// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() <{value = dense<1> : tensor<2x3x3x3x2xi8>}> {device = ""} : () -> tensor<2x3x3x3x2xi8> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<-43> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<-2322> : tensor<1x1x1x1x2xi32>}> : () -> tensor<1x1x1x1x2xi32> // CHECK: %[[CONCAT:.*]] = "tf.Concat"({{.*}}) // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%[[CONCAT]], {{.*}}) : (tensor<10xi32>, tensor<2xi64>) -> tensor<5x2xi32> @@ -565,7 +565,7 @@ module attributes {tf_saved_model.semantics} { } // CHECK-LABEL: func @batch_matmul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<-131072> : tensor<20x30x1x3xi32>} : () -> tensor<20x30x1x3xi32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<-131072> : tensor<20x30x1x3xi32>}> : () -> tensor<20x30x1x3xi32> // CHECK: %[[CAST:.*]] = "tf.Cast" // CHECK: %[[XLADOTV2_0:.*]] = "tf.XlaDotV2"(%[[CAST]] // CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[XLADOTV2_0]], %[[CONST]]) : (tensor<20x30x64x3xi32>, tensor<20x30x1x3xi32>) -> 
tensor<20x30x64x3xi32> @@ -602,7 +602,7 @@ module attributes {tf_saved_model.semantics} { } // CHECK-LABEL: func @broadcasting_weight_batch_matmul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<[2, 1024, 3]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<[2, 1024, 3]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK: %[[CAST:.*]] = "tf.Cast" // CHECK: %[[BROADCAST_TO:.*]] = "tf.BroadcastTo"({{.*}}, %[[CONST]]) : (tensor<1024x3xi8>, tensor<3xi64>) -> tensor<2x1024x3xi8> // CHECK: %[[XLADOTV2_0:.*]] = "tf.XlaDotV2"(%[[CAST]], %[[BROADCAST_TO]]) @@ -639,8 +639,8 @@ module attributes {tf_saved_model.semantics} { } // CHECK-LABEL: func @broadcasting_input_batch_matmul -// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() {device = "", value = {{.*}} : tensor<2x2x1024x3xi8>} : () -> tensor<2x2x1024x3xi8> -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<[2, 2, 1, 1024]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() <{value = {{.*}} : tensor<2x2x1024x3xi8>}> {device = ""} : () -> tensor<2x2x1024x3xi8> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<[2, 2, 1, 1024]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[CAST:.*]] = "tf.Cast" // CHECK: %[[BROADCAST_TO:.*]] = "tf.BroadcastTo"(%[[CAST]], %[[CONST]]) : (tensor<2x1x1024xi8>, tensor<4xi64>) -> tensor<2x2x1x1024xi8> // CHECK: %[[XLADOTV2_0:.*]] = "tf.XlaDotV2"(%[[BROADCAST_TO]], %[[WEIGHT]]) @@ -677,14 +677,14 @@ module attributes {tf_saved_model.semantics} { } // CHECK-LABEL: func @dynamic_shape_batch_matmul -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-DAG: %[[CONST_3:.*]] = "tf.Const"() {value = dense<[1024, 3]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-DAG: %[[CONST_4:.*]] = "tf.Const"() {value = dense<> : tensor<0xi64>} : () -> tensor<0xi64> -// CHECK-DAG: %[[CONST_5:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() {device = "", value = {{.*}} : tensor<1024x3xi8>} : () -> tensor<1024x3xi8> -// CHECK: %[[CAST:.*]] = "tf.Cast"({{.*}}) {Truncate = false, device = ""} : (tensor) -> tensor +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<2> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[CONST_3:.*]] = "tf.Const"() <{value = dense<[1024, 3]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[CONST_4:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi64>}> : () -> tensor<0xi64> +// CHECK-DAG: %[[CONST_5:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() <{{{value = .* : tensor<1024x3xi8>}}}> {device = ""} : () -> tensor<1024x3xi8> +// CHECK: %[[CAST:.*]] = "tf.Cast"({{.*}}) <{Truncate = false}> {device = ""} : (tensor) -> tensor // CHECK: %[[SHAPE:.*]] = "tf.Shape"(%[[CAST]]) : (tensor) -> tensor<3xi64> // CHECK: %[[SLICE_1:.*]] = "tf.Slice"(%[[SHAPE]], %[[CONST]], %[[CONST_2]]) : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> // CHECK: %[[SLICE_2:.*]] = "tf.Slice"(%[[SHAPE]], %[[CONST_2]], 
%[[CONST_1]]) : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi64> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/replace_cast_hacks_with_tf_xla_ops_large_constants.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/replace_cast_hacks_with_tf_xla_ops_large_constants.mlir index 775ab82e10501a..3c0c36684916f3 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/replace_cast_hacks_with_tf_xla_ops_large_constants.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/replace_cast_hacks_with_tf_xla_ops_large_constants.mlir @@ -56,7 +56,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p } // CHECK-LABEL: func @conv_with_filter_larger_than_1GB -// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<-237772800> : tensor<1x1x1x512xi32>} : () -> tensor<1x1x1x512xi32> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<-237772800> : tensor<1x1x1x512xi32>}> : () -> tensor<1x1x1x512xi32> // CHECK: %[[PADV2_0:.*]] = "tf.PadV2" // CHECK: %[[XLACONVV2_0:.*]] = "tf.XlaConvV2"(%[[PADV2_0]] // CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[XLACONVV2_0]], %[[CONST]]) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/unfreeze_constants.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/unfreeze_constants.mlir index ddf33e312ab077..b7b4fa1cc39aed 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/unfreeze_constants.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/unfreeze_constants.mlir @@ -15,7 +15,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-SAME: tf_saved_model.initializer_type = "restore_op" // Check that variable is initialized by assigning the const value within the initializer function. 
-// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<8xf32>} +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<8xf32>}> // CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} // CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) @@ -44,11 +44,11 @@ module attributes {tf_saved_model.semantics} { // CHECK-SAME: tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"] // CHECK-SAME: tf_saved_model.initializer_type = "restore_op" -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {{{.*value = dense<1.000000e\+00> : tensor<8xf32>.*}}} +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{{{.*value = dense<1.000000e\+00> : tensor<8xf32>.*}}}> // CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} // CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {{{.*value = dense<2.000000e\+00> : tensor<8xf32>.*}}} +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{{{.*value = dense<2.000000e\+00> : tensor<8xf32>.*}}}> // CHECK-DAG: %[[VAR_HANDLE_1:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_1".*}} // CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[CST_1]]) @@ -84,11 +84,11 @@ module attributes {tf_saved_model.semantics} { // CHECK-SAME: tf_saved_model.exported_names = ["tf_saved_model.session_initializer_init"] // CHECK-SAME: tf_saved_model.initializer_type = "restore_op" -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<8xf32>} +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<8xf32>}> // CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() // CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor<8xf32>} +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<8xf32>}> // CHECK-DAG: %[[VAR_HANDLE_1:.*]] = "tf.VarHandleOp"() // CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[CST_1]]) @@ -123,7 +123,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-SAME: tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"] // CHECK-SAME: tf_saved_model.initializer_type = "restore_op" -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<3.000000e+00> : tensor<8xf32>} +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor<8xf32>}> // CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() // CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) @@ -185,7 +185,7 @@ module attributes {tf_saved_model.semantics} { // Check that `tf.VarHandleOp` is only created for the constant that is larger // than the threshold (16 bytes for this test). 
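For orientation: the unfreeze-constants pass exercised here rewrites a frozen `tf.Const` that exceeds the size threshold into a variable that is assigned once in the session initializer and read back at the original use site. A schematic sketch of that rewrite (hand-written for illustration, not taken from the test file):

// Before unfreezing: a frozen constant.
%cst = "tf.Const"() <{value = dense<1.000000e+00> : tensor<8xf32>}> : () -> tensor<8xf32>
// After unfreezing, inside the initializer function: create the variable and assign the value.
%handle = "tf.VarHandleOp"() <{container = "", shared_name = "const_0"}> : () -> tensor<!tf_type.resource<tensor<8xf32>>>
"tf.AssignVariableOp"(%handle, %cst) : (tensor<!tf_type.resource<tensor<8xf32>>>, tensor<8xf32>) -> ()
// After unfreezing, at the original use site: read the variable instead of the constant.
%val = "tf.ReadVariableOp"(%handle) : (tensor<!tf_type.resource<tensor<8xf32>>>) -> tensor<8xf32>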
-// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {{{.*value = dense<5.000000e\+00> : tensor<8xf32>.*}}} +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{{{.*value = dense<5.000000e\+00> : tensor<8xf32>.*}}}> // CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} // CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) @@ -199,8 +199,8 @@ module attributes {tf_saved_model.semantics} { // CHECK: @serving_default // CHECK-DAG: %[[VAR_HANDLE_2:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} : () -> tensor>> // CHECK-DAG: %[[READ_VAR_0:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_2]]) : (tensor>>) -> tensor<8xf32> -// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {{{.*value = dense<5.000000e\+00> : tensor<4xf32>.*}}} -// CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {{{.*value = dense<0> : tensor.*}}} +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{{{.*value = dense<5.000000e\+00> : tensor<4xf32>.*}}}> +// CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{{{.*value = dense<0> : tensor.*}}}> // CHECK-DAG: %[[CONCAT:.*]] = "tf.ConcatV2"(%[[READ_VAR_0]], %[[CST_1]], %[[AXIS]]) // CHECK: return %[[CONCAT]] : tensor<12xf32> } @@ -214,7 +214,7 @@ module attributes {tf_saved_model.semantics} { module attributes {tf_saved_model.semantics} { // CHECK: func.func @init_func_restore_op() -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<8xf32>} +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<8xf32>}> // Check that the variable's shared_name contains the fused loc's items joined // by the delimiter "_" and suffixed with a number. // CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "apple_banana_0".*}} @@ -247,7 +247,7 @@ module attributes {tf_saved_model.semantics} { %cst_2 = "tf.Const"() {value = dense<1.0> : tensor<1x5x5x1024xf32>} : () -> tensor<1x5x5x1024xf32> // Check that these constants are unfrozen. // CHECK: func private @__inference_main - // CHECK: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {container = "", shared_name = "const_0"} : () -> tensor>> + // CHECK: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() <{container = "", shared_name = "const_0"}> : () -> tensor>> // CHECK: %[[READ_VAR_0:.*]] = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<1x5x5x1024xf32> %0:3 = "tf.While"(%cst_0, %cst_1, %arg0) {T = [i32, i32, f32], _lower_using_switch_merge = true, _num_original_outputs = 4 : i64, _read_only_resource_inputs = [], body = @while_body, cond = @while_cond, device = "", is_stateless = true, output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x5x5x1024>], parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor<1x5x5x1024xf32>) -> (tensor, tensor, tensor<1x5x5x1024xf32>) %1 = "tf.AddV2"(%0#2, %cst_2) {device = ""} : (tensor<1x5x5x1024xf32>, tensor<1x5x5x1024xf32>) -> tensor<1x5x5x1024xf32> @@ -260,7 +260,7 @@ module attributes {tf_saved_model.semantics} { %cst_0 = "tf.Const"() {value = dense<1.0> : tensor<1x5x5x1024xf32>} : () -> tensor<1x5x5x1024xf32> // Check that these constants are remained in constants. 
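Note on the syntax churn: most of the CHECK-line updates in this patch reflect the TF dialect enabling properties-backed attributes (the `usePropertiesForAttributes = 0` override is removed from tf_op_base.td just below). With properties enabled, the generic printer emits inherent attributes in a `<{...}>` segment and keeps discardable attributes such as `device` in the trailing `{...}` dictionary. An illustrative before/after, mirroring the `tf.Cast` updates elsewhere in this patch (values are made up):

// Old printed form: one attribute dictionary holds everything.
%0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1xf32>) -> tensor<1xi32>
// New printed form: the inherent `Truncate` attribute becomes a property; `device` stays discardable.
%0 = "tf.Cast"(%arg0) <{Truncate = false}> {device = ""} : (tensor<1xf32>) -> tensor<1xi32>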
// CHECK: func private @while_body - // CHECK-DAG: %[[CST_0:.*]]= "tf.Const"() {value = dense<1.000000e+00> : tensor<1x5x5x1024xf32>} : () -> tensor<1x5x5x1024xf32> + // CHECK-DAG: %[[CST_0:.*]]= "tf.Const"() <{value = dense<1.000000e+00> : tensor<1x5x5x1024xf32>}> : () -> tensor<1x5x5x1024xf32> %0 = "tf.AddV2"(%arg0, %cst) {device = ""} : (tensor, tensor) -> tensor %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor %2 = "tf.Identity"(%arg1) {device = ""} : (tensor) -> tensor @@ -269,13 +269,13 @@ module attributes {tf_saved_model.semantics} { return %1, %2, %5 : tensor, tensor, tensor<1x5x5x1024xf32> } - func.func private @while_cond(%arg0: tensor {tf._user_specified_name = "while/loop_counter"}, %arg1: tensor {tf._user_specified_name = "while/maximum_iterations"}, %arg2: tensor<1x5x5x1024xf32>) -> tensor + func.func private @while_cond(%arg0: tensor {tf._user_specified_name = "while/loop_counter"}, %arg1: tensor {tf._user_specified_name = "while/maximum_iterations"}, %arg2: tensor<1x5x5x1024xf32>) -> tensor attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x5x5x1024>], tf._original_func_name = "while_cond_60"} { %cst = "tf.Const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor<4xi32> %cst_0 = "tf.Const"() {value = dense<5.0> : tensor} : () -> tensor // Check that these constants are remained in constants. // CHECK: func private @while_cond - // CHECK-DAG: %[[CST:.*]]= "tf.Const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + // CHECK-DAG: %[[CST:.*]]= "tf.Const"() <{value = dense<[0, 1, 2, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> %0 = "tf.Sum"(%arg2, %cst) {device = "", keep_dims = false} : (tensor<1x5x5x1024xf32>, tensor<4xi32>) -> tensor %1 = "tf.Less"(%0, %cst_0) {device = ""} : (tensor, tensor) -> tensor %2 = "tf.Identity"(%1) {device = ""} : (tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 78d87e81c45525..01f225c9abd595 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -353,11 +353,14 @@ cc_library( ":attribute_utils", ":convert_type", ":dynamic_shape_utils", + ":tensorflow_all_ops_inc_gen", ":tensorflow_attributes", ":tensorflow_op_interfaces", ":tensorflow_op_interfaces_inc_gen", + ":tensorflow_remaining_ops_inc_gen", ":tensorflow_side_effects", ":tensorflow_structs", + ":tensorflow_tfrt_ops_inc_gen", ":tensorflow_traits", ":tensorflow_types", ":tf_arith_ops_folder", @@ -369,6 +372,8 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_canonicalize_inc_gen", "//tensorflow/core:framework", "//tensorflow/core:lib", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:ControlFlowInterfaces", "@llvm-project//mlir:DerivedAttributeOpInterface", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index c3fdc7a3c83685..a5bb0051cc8fe4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -48,7 +48,6 @@ TODO: Make invariants more structured so that we can reference them in ops. 
}]; let cppNamespace = "::mlir::TF"; - let usePropertiesForAttributes = 0; } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index f9fa3d13732451..e81742e90cf56f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -19,6 +19,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_H_ +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project // IWYU pragma: keep #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 5e4d546c4d47bc..e170bae81a8bcf 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -312,7 +312,7 @@ def TF_YieldOp : TF_Op<"Yield", [Terminator, Pure, NativeOpTrait<"ReturnLike", [], "", "">, - ParentOneOf<["CaseRegionOp", "IfRegionOp", "WhileRegionOp"]>, + ParentOneOf<["CaseRegionOp", "IfRegionOp", "WhileRegionOp", "GeneratorDatasetRegionOp"]>, DeclareOpInterfaceMethods, ]> { @@ -389,6 +389,57 @@ else_branch: A region that computes the outputs of the op if cond = false. let hasCanonicalizer = 1; } +def TF_GeneratorDatasetRegionOp : TF_Op<"GeneratorDatasetRegion", + [AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + SingleBlockImplicitTerminator<"YieldOp">, + TF_GeneratorOpSideEffect, + ]> { + let summary = "Regional version of GeneratorDataset"; + + let description = [{ +Creates a dataset that invokes its 'next' region to generate elements. Conceptually, +within MLIR, we treat this op as if it fills a buffer with all the results right away, +and those results are then passed (through the variant tensor result) to +MakeIterator / IteratorGetNext. Note that the actual TF implementation differs: It +generates the next element just in time, during IteratorGetNext. + +init_extra_args: Additional arguments to pass to 'init'. +next_extra_args: Additional arguments to pass to 'next'. (Passed after the + normal arguments which are from the return values of 'init'.) +finalize_extra_args: Additional arguments to pass to 'finalize'. (Passed after + the normal arguments which are from the return values of 'init'.) 
+ }]; + + let arguments = (ins + Variadic:$init_func_other_args, + Variadic:$next_func_other_args, + Variadic:$finalize_func_other_args, + + ConfinedAttr]>:$output_types, + ConfinedAttr]>:$output_shapes, + DefaultValuedOptionalAttr:$metadata + ); + + let results = (outs + TF_VariantTensor:$handle + ); + + let regions = (region SizedRegion<1>:$init, + SizedRegion<1>:$next, + SizedRegion<1>:$finalize + ); + + TF_DerivedOperandTypeListAttr Tinit_func_args = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedOperandTypeListAttr Tnext_func_args = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedOperandTypeListAttr Tfinalize_func_args = TF_DerivedOperandTypeListAttr<2>; +} + def TF_LegacyCallOp : TF_Op<"LegacyCall", [CallOpInterface, DeclareOpInterfaceMethods, Pure]> { @@ -455,9 +506,7 @@ def TF_ParseExampleOp : TF_Op<"ParseExample", Variadic:$dense_keys, Variadic>:$dense_defaults, - TF_ShapeAttrArray:$dense_shapes, - DenseI32ArrayAttr:$resultSegmentSizes, - DenseI32ArrayAttr:$operandSegmentSizes + TF_ShapeAttrArray:$dense_shapes ); let results = (outs @@ -491,8 +540,7 @@ def TF_ParseExampleV2Op : TF_Op<"ParseExampleV2", Variadic>:$dense_defaults, ConfinedAttr]>:$num_sparse, - TF_ShapeAttrArray:$dense_shapes, - DenseI32ArrayAttr:$resultSegmentSizes + TF_ShapeAttrArray:$dense_shapes ); let results = (outs diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index 4f21118dc24b71..cee0e40f9cfeb5 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -19,55 +19,60 @@ limitations under the License. #include #include #include +#include #include -#include #include -#include -#include #include #include #include #include +#include "absl/log/check.h" +#include "absl/strings/str_cat.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project -#include "mlir/IR/DialectImplementation.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from 
@llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_canonicalization_helper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_device_helper.h" @@ -75,12 +80,14 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" @@ -1708,9 +1715,10 @@ void ConstOp::build(OpBuilder& builder, OperationState& result, Type type, LogicalResult ConstOp::inferReturnTypes( MLIRContext* context, std::optional location, ValueRange operands, - DictionaryAttr attributes, OpaqueProperties, RegionRange regions, + DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions, SmallVectorImpl& inferredReturnTypes) { - auto value = attributes.get("value"); + ConstOpAdaptor adaptor(operands, attributes, properties, regions); + auto value = adaptor.getValue(); if (!value) return emitOptionalError(location, "missing attribute 'value'"); if (auto elem_attr = value.dyn_cast()) { inferredReturnTypes.assign({elem_attr.getType()}); @@ -1951,13 +1959,13 @@ static LogicalResult inferConvReturnTypeComponents( LogicalResult Conv2DOp::inferReturnTypeComponents( MLIRContext* context, std::optional location, - ValueShapeRange operands, DictionaryAttr attributes, OpaqueProperties, - RegionRange regions, + ValueShapeRange operands, DictionaryAttr attributes, + OpaqueProperties properties, RegionRange regions, SmallVectorImpl& inferredReturnShapes) { - Conv2DOpAdaptor op(operands.getValues(), attributes); + Conv2DOpAdaptor op(operands.getValues(), attributes, properties, regions); ArrayRef explicit_padding; ArrayAttr explicit_pad = - attributes.get("explicit_paddings").dyn_cast_or_null<::mlir::ArrayAttr>(); + op.getExplicitPaddings().dyn_cast_or_null<::mlir::ArrayAttr>(); if (!explicit_pad) { explicit_pad = ::mlir::Builder(context).getI64ArrayAttr({}); } @@ -2150,17 +2158,12 @@ StringRef Conv2DBackpropInputOp::GetOptimalLayout( LogicalResult Conv3DOp::inferReturnTypeComponents( MLIRContext* 
context, std::optional location, - ValueShapeRange operands, DictionaryAttr attributes, OpaqueProperties, - RegionRange regions, + ValueShapeRange operands, DictionaryAttr attributes, + OpaqueProperties properties, RegionRange regions, SmallVectorImpl& inferredReturnShapes) { - Conv3DOpAdaptor op(operands.getValues(), attributes); - ArrayRef explicit_padding; - ArrayAttr explicit_pad = - attributes.get("explicit_paddings").dyn_cast_or_null<::mlir::ArrayAttr>(); - if (!explicit_pad) { - explicit_pad = ::mlir::Builder(context).getI64ArrayAttr({}); - } - explicit_padding = explicit_pad.getValue(); + Conv3DOpAdaptor op(operands.getValues(), attributes, properties, regions); + ArrayAttr explicit_pad = ::mlir::Builder(context).getI64ArrayAttr({}); + ArrayRef explicit_padding = explicit_pad.getValue(); return inferConvReturnTypeComponents(location, op, explicit_padding, inferredReturnShapes); @@ -2968,6 +2971,70 @@ StringRef FusedBatchNormV3Op::GetOptimalLayout(const RuntimeDevices& devices) { return ::mlir::TF::GetOptimalLayout(devices, this); } +//===----------------------------------------------------------------------===// +// GeneratorDatasetRegionOp +//===----------------------------------------------------------------------===// + +bool GeneratorDatasetRegionOp::areTypesCompatible(Type t1, Type t2) { + return true; // Don't enforce type checking across control-flow edges. +} + +void GeneratorDatasetRegionOp::getRegionInvocationBounds( + ArrayRef operands, + SmallVectorImpl& invocationBounds) { + // We invoke `init` once, `finalize` once, and `next` any number of times. + invocationBounds.emplace_back(InvocationBounds(1, 1)); // init + invocationBounds.emplace_back(InvocationBounds::getUnknown()); // next + invocationBounds.emplace_back(InvocationBounds(1, 1)); // finalize +} + +OperandRange GeneratorDatasetRegionOp::getEntrySuccessorOperands( + RegionBranchPoint point) { + auto end = this->getOperation()->operand_end(); + if (point.isParent()) { + // The op itself doesn't branch back to itself. + return ::mlir::OperandRange(end, end); + } else if (point.getRegionOrNull() == &getInit()) { + return getInitFuncOtherArgs(); + } else if (point.getRegionOrNull() == &getNext()) { + return getNextFuncOtherArgs(); + } else /* finalize region */ { + return getFinalizeFuncOtherArgs(); + } +} + +void GeneratorDatasetRegionOp::getSuccessorRegions( + RegionBranchPoint point, SmallVectorImpl& regions) { + int n; + if (point.isParent()) { + // The op itself branches to `init` first. + regions.push_back( + RegionSuccessor(&getInit(), getInit().front().getArguments())); + } else if (point.getRegionOrNull() == &getInit()) { + // `init` branches to `next`, passing along the arguments given to `init`'s + // yield. Said arguments precede the "other args". + n = getInitFuncOtherArgs().size(); + regions.push_back(RegionSuccessor( + &getNext(), getNext().front().getArguments().drop_back(n))); + } else if (point.getRegionOrNull() == &getNext()) { + // `next` branches to itself, or to `finalize`, passing all arguments given + // to `next`s yield. + + // The number of values we're passing along. + int num = getNext().front().getTerminator()->getNumOperands(); + + // The number of extra values from the parent ops that should go to `next` + // and `finalize`. 
+ regions.push_back(RegionSuccessor( + &getNext(), getNext().front().getArguments().slice(0, num))); + regions.push_back(RegionSuccessor( + &getFinalize(), getFinalize().front().getArguments().slice(0, num))); + } else { + // `finalize` branches back to the op itself, not passing any arguments. + regions.push_back(RegionSuccessor()); + } +} + //===----------------------------------------------------------------------===// // GatherV2Op //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index 4ac541799afd4f..01cbbb9a46967c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -18,11 +18,12 @@ limitations under the License. #include #include +#include +#include +#include #include -#include #include #include -#include #include #include #include @@ -34,16 +35,15 @@ limitations under the License. #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/MathExtras.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -53,26 +53,28 @@ limitations under the License. #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project -#include "mlir/IR/DialectImplementation.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeRange.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include 
"tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_canonicalization_helper.h" @@ -80,14 +82,12 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/util/tensor_format.h" namespace mlir { namespace TF { @@ -4389,6 +4389,12 @@ MutableOperandRange YieldOp::getMutableSuccessorOperands( this->getOperation(), 1, this->getOperation()->getOperands().size() - 1); } + } else if (auto regionOp = llvm::dyn_cast( + this->getOperation()->getParentOp())) { + if (®ionOp.getFinalize() == this->getOperation()->getParentRegion()) { + // `finalize`'s returns get discarded. + return MutableOperandRange(this->getOperation(), 0, 0); + } } return MutableOperandRange(this->getOperation()); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc index 8cce823ae5233c..b6b8c41fdfaa6d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc @@ -15,14 +15,19 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h" +#include + +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/core/framework/resource_handle.h" //===----------------------------------------------------------------------===// // _TfrtGetResourceOp @@ -84,6 +89,43 @@ mlir::LogicalResult PwStreamResultsOp::verify() { return mlir::success(); } +//===----------------------------------------------------------------------===// +// IfrtProgramCall +//===----------------------------------------------------------------------===// + +mlir::LogicalResult IfrtCallOp::verify() { + auto func = getOperation()->getParentOfType(); + if (func != nullptr && func->hasAttr("tfrt_ifrt_serving.program_id")) { + return emitOpError() << "cannot be nested inside an IFRT program"; + } + + for (mlir::Value arg : getArgs()) { + if (mlir::getElementTypeOrSelf(arg.getType()) + .isa()) { + return emitOpError() + << "does not support passing '!tf.resource' values as arguments"; + } + } + if (getArgs().size() != getArgNames().size()) { + return emitOpError() + << "expects every argument to have an 'arg_name' attribute"; + } + + for (mlir::Value result : getResults()) { + if (mlir::getElementTypeOrSelf(result.getType()) + .isa()) { + return emitOpError() + << "does not support returning '!tf.resource' values as results"; + } + } + if (getArgs().size() != getArgNames().size()) { + return emitOpError() + << "expects every result to have a 'result_name' attribute"; + } + + return mlir::success(); +} + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h index e4a41d41e0c90e..c6c3eb213d1778 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TFRT_OPS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TFRT_OPS_H_ +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project // IWYU pragma: keep #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td index a0e2935255e95a..0684e188db695d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td @@ -63,6 +63,42 @@ def TF__TfrtGetResourceOp : TF_Op<"_TfrtGetResource", let hasVerifier = 1; } +def TF_IfrtCallOp : TF_Op<"IfrtCall", []> { + let summary = "Invokes a program via IFRT on a device"; + + let description = [{ + This op calls an IFRT program uniquely identified by the given program id. 
+ + During lowering from a `tf_device.cluster_func` op to a `tf.IfrtCall` op, + the region owned by the former will be outlined to a function with a + `tfrt_ifrt_serving.program_id` attribute. After that, the runtime ensures + that the outlined function is compiled into an executable and is available + for lookup from `IfrtCall` TF ops. + + Ifrt program execution is encapsulated by `ServingExecutable` + abstraction, which takes named arguments and returns named results. Thus, + this op takes `arg_names` and `result_names` attributes to convert between + positional arguments/results of the `tf.IfrtCall` op and named ones for + invoking `ServingExecutable`. This op also takes `variable_names` attribute + to bind the variables (weights). + }]; + + let arguments = (ins + Variadic : $args, + I64Attr : $program_id, + StrArrayAttr : $arg_names, + StrArrayAttr : $variable_names, + StrArrayAttr : $result_names + ); + + let results = (outs Variadic : $results); + + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; + + let hasVerifier = 1; +} + // TODO(chky): Consider adding this op to tensorflow core ops. def TF_PwStreamResultsOp : TF_Op<"PwStreamResults"> { let summary = "Streams results back to the controller"; diff --git a/tensorflow/compiler/mlir/tensorflow/tests/batchmatmul_to_einsum.mlir b/tensorflow/compiler/mlir/tensorflow/tests/batchmatmul_to_einsum.mlir index 30dd1200d0bab7..4b6a33cd657473 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/batchmatmul_to_einsum.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/batchmatmul_to_einsum.mlir @@ -2,42 +2,42 @@ func.func @test_batch_matmul_to_einsum(%arg0: tensor<1x2x3xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x2x4xf32> { // CHECK-LABEL: test_batch_matmul_to_einsum - // CHECK: "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...kn->...mn"} : (tensor<1x2x3xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> + // CHECK: "tf.Einsum"(%arg0, %arg1) <{equation = "...mk,...kn->...mn"}> : (tensor<1x2x3xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x2x3xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> func.return %0: tensor<1x2x4xf32> } func.func @test_batch_matmul_broadcast_to_einsum(%arg0: tensor<2x2x4xf32>, %arg1: tensor<2x4x2xf32>) -> tensor<2x2x2xf32> { // CHECK-LABEL: test_batch_matmul_broadcast_to_einsum - // CHECK: "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...kn->...mn"} : (tensor<2x2x4xf32>, tensor<2x4x2xf32>) -> tensor<2x2x2xf32> + // CHECK: "tf.Einsum"(%arg0, %arg1) <{equation = "...mk,...kn->...mn"}> : (tensor<2x2x4xf32>, tensor<2x4x2xf32>) -> tensor<2x2x2xf32> %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<2x2x4xf32>, tensor<2x4x2xf32>) -> tensor<2x2x2xf32> func.return %0: tensor<2x2x2xf32> } func.func @test_batch_matmul_dynamic_shape_both_arg_to_einsum(%arg0: tensor<1x2x?xf32>, %arg1: tensor) -> tensor<1x2x4xf32> { // CHECK-LABEL: test_batch_matmul_dynamic_shape_both_arg_to_einsum - // CHECK: "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...kn->...mn"} : (tensor<1x2x?xf32>, tensor) -> tensor<1x2x4xf32> + // CHECK: "tf.Einsum"(%arg0, %arg1) <{equation = "...mk,...kn->...mn"}> : (tensor<1x2x?xf32>, tensor) -> tensor<1x2x4xf32> %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x2x?xf32>, tensor) -> tensor<1x2x4xf32> func.return %0: tensor<1x2x4xf32> } func.func @test_batch_matmul_dynamic_shape_one_arg_to_einsum(%arg0: 
tensor<1x2x?xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x2x4xf32> { // CHECK-LABEL: test_batch_matmul_dynamic_shape_one_arg_to_einsum - // CHECK: "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...kn->...mn"} : (tensor<1x2x?xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> + // CHECK: "tf.Einsum"(%arg0, %arg1) <{equation = "...mk,...kn->...mn"}> : (tensor<1x2x?xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x2x?xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> func.return %0: tensor<1x2x4xf32> } func.func @test_batch_matmul_adj_to_einsum(%arg0: tensor<1x2x3xf32>, %arg1: tensor<4x3xf32>) -> tensor<1x2x4xf32> { // CHECK-LABEL: test_batch_matmul_adj_to_einsum - // CHECK: %[[RES_EINSUM:[0-9]*]] = "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...nk->...mn"} : (tensor<1x2x3xf32>, tensor<4x3xf32>) -> tensor<1x2x4xf32> + // CHECK: %[[RES_EINSUM:[0-9]*]] = "tf.Einsum"(%arg0, %arg1) <{equation = "...mk,...nk->...mn"}> : (tensor<1x2x3xf32>, tensor<4x3xf32>) -> tensor<1x2x4xf32> // CHECK: return %[[RES_EINSUM]] : tensor<1x2x4xf32> %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = true} : (tensor<1x2x3xf32>, tensor<4x3xf32>) -> tensor<1x2x4xf32> func.return %0: tensor<1x2x4xf32> } func.func @test_batch_matmulV2_adj_to_einsum(%arg0: tensor<1x3x2xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x2x4xf32> { - // CHECK: %[[RES_EINSUM:[0-9]*]] = "tf.Einsum"(%arg0, %arg1) {equation = "...km,...kn->...mn"} : (tensor<1x3x2xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> + // CHECK: %[[RES_EINSUM:[0-9]*]] = "tf.Einsum"(%arg0, %arg1) <{equation = "...km,...kn->...mn"}> : (tensor<1x3x2xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> // CHECK: return %[[RES_EINSUM]] : tensor<1x2x4xf32> %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = false} : (tensor<1x3x2xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> func.return %0: tensor<1x2x4xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir index 64b0bfb1a6202c..e94cb5f859ec34 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir @@ -62,10 +62,10 @@ func.func @multiple_islands(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor // CHECK: %[[SUB1:.*]], %[[SUB1_control:.*]] = tf_executor.island(%[[ADD2_control]]) wraps "tf.Sub"(%arg0, %arg1) // CHECK: %[[MUL:.*]], %[[MUL_control:.*]] = tf_executor.island wraps "tf.Mul"(%[[SUB1]], %arg1) // CHECK: %[[SUB2:.*]], %[[SUB2_control:.*]] = tf_executor.island(%[[ADD2_control]], %[[MUL_control]]) wraps "tf.Sub"(%[[ADD1]], %[[SUB1]]) -// CHECK: %[[PRINT1:.*]], %[[PRINT1_control:.*]] = tf_executor.island wraps "tf.Print"(%[[SUB2]]) {message = "sub result"} +// CHECK: %[[PRINT1:.*]], %[[PRINT1_control:.*]] = tf_executor.island wraps "tf.Print"(%[[SUB2]]) <{message = "sub result"}> // CHECK: %[[ISLAND1:.*]] = tf_executor.island(%[[ADD2_control]], %[[MUL_control]]) wraps "tf.NoOp"() // CHECK: %[[ADD3:.*]], %[[ADD3_control:.*]] = tf_executor.island(%[[ISLAND1]], %[[ADD2_control]]) wraps "tf.Add"(%[[ADD2]], %[[ADD2]]) -// CHECK: %[[PRINT2:.*]], %[[PRINT2_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD3]]) {message = "add result"} +// CHECK: %[[PRINT2:.*]], %[[PRINT2_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD3]]) <{message = "add result"}> // CHECK: tf_executor.fetch %[[ADD2]], %[[MUL]], %[[PRINT1_control]], %[[PRINT2_control:.*]] : // CHECK: } // CHECK: return %[[GRAPH]]#0, 
%[[GRAPH]]#1 @@ -87,7 +87,7 @@ func.func @dangling_print(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<* // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg1) // CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1_control:.*]], %arg1) -// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2_control:.*]]) {message = "add result"} +// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2_control:.*]]) <{message = "add result"}> // CHECK: tf_executor.fetch %[[ADD1]], %[[ADD2]], %[[PRINT_control]] : // CHECK: } // CHECK: return %[[GRAPH]]#0, %[[GRAPH]]#1 @@ -116,11 +116,11 @@ func.func @switch_and_merge(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg1) // CHECK: %[[LESS:.*]], %[[LESS_control:.*]] = tf_executor.island wraps "tf.Less"(%arg1, %arg1) -// CHECK: %[[PRINT1:.*]], %[[PRINT1_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD1]]) {message = "add result 1"} +// CHECK: %[[PRINT1:.*]], %[[PRINT1_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD1]]) <{message = "add result 1"}> // CHECK: %[[ISLAND1:.*]] = tf_executor.island(%[[LESS_control]], %[[PRINT1_control]]) wraps "tf.NoOp"() // CHECK: %[[SWITCH_false:.*]], %[[SWITCH_true:.*]], {{.*}} = tf_executor.Switch %[[ADD1]], %[[LESS]], %[[ISLAND1]] // CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[SWITCH_false]], %arg1) -// CHECK: %[[PRINT2:.*]], %[[PRINT2_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2]]) {message = "add result 2"} +// CHECK: %[[PRINT2:.*]], %[[PRINT2_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2]]) <{message = "add result 2"}> // CHECK: %[[MERGE:.*]], %[[MERGE_index:.*]], %{{.*}} = tf_executor.Merge %[[ADD2]], %[[SWITCH_true]], %[[PRINT2_control]] // CHECK: tf_executor.fetch %[[MERGE]], %[[MERGE_index]] // CHECK: } @@ -141,7 +141,7 @@ func.func @control_flow_plumbing(%arg0: tensor<*xi32>, %arg1: tensor) -> te // CHECK-LABEL: func @control_flow_plumbing // CHECK: %[[GRAPH:.*]] = tf_executor.graph { -// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%arg0) {message = "Random Print"} +// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%arg0) <{message = "Random Print"}> // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island(%[[PRINT_control]]) wraps "tf.Add"(%arg0, %arg1) // CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1]], %arg1) // CHECK: tf_executor.fetch %[[ADD2]] : tensor<*xi32> @@ -193,7 +193,7 @@ func.func @non_aliasing_reads_writes( // CHECK: %[[READ0:.*]], %[[READ0_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg0) // CHECK: %[[ASSIGN0_CONTROL:.*]] = tf_executor.island(%[[READ0_CONTROL]]) wraps "tf.AssignVariableOp"(%arg0, %arg2) // CHECK: %[[READ1:.*]], %[[READ1_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg1) -// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v0"} +// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v0"}> // CHECK: %[[READ2:.*]], %[[READ2_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[VH0]]) 
// CHECK: %[[ASSIGN1_CONTROL:.*]] = tf_executor.island(%[[READ1_CONTROL]]) wraps "tf.AssignVariableOp"(%arg1, %[[READ0:.*]]) // CHECK: %[[ASSIGN2_CONTROL:.*]] = tf_executor.island(%[[ASSIGN0_CONTROL]]) wraps "tf.AssignVariableOp"(%arg0, %[[READ2]]) @@ -222,8 +222,8 @@ func.func @unknown_side_effecting_op(%arg0: tensor<32xf32>) -> () { // CHECK-LABEL: func @unknown_side_effecting_op // CHECK: tf_executor.graph { -// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v0"} -// CHECK: %[[VH1:.*]], %[[VH1_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v1"} +// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v0"}> +// CHECK: %[[VH1:.*]], %[[VH1_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v1"}> // CHECK: %[[READ0:.*]], %[[READ0_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[VH0]]) // CHECK: %[[ASSIGN0_CONTROL:.*]] = tf_executor.island wraps "tf.AssignVariableOp"(%[[VH1]], %arg0) // CHECK: %[[UNKNOWN_CONTROL:.*]] = tf_executor.island(%[[READ0_CONTROL]], %[[ASSIGN0_CONTROL]]) wraps "tf._UnknownSideEffectingOp_"() diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 2adf88449d3853..612f01ce23ce8a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -19,29 +19,29 @@ func.func @tfAssertFalse(%arg0: tensor<1x1x6x2xf32>) { // CHECK-LABEL: testGatherToV2 // Ensures that axis param and batch_dims attr use their default values of 0. func.func @testGatherToV2(%params: tensor<4x3xf32>, %indices: tensor<1x2xi32>) -> tensor<2x3xf32> { - // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK: "tf.GatherV2"(%arg0, %arg1, %[[AXIS]]) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<1x2xi32>, tensor) -> tensor<2x3xf32> + // CHECK: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK: "tf.GatherV2"(%arg0, %arg1, %[[AXIS]]) <{batch_dims = 0 : i64}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<1x2xi32>, tensor) -> tensor<2x3xf32> %0 = "tf.Gather"(%params, %indices) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<1x2xi32>) -> tensor<2x3xf32> func.return %0: tensor<2x3xf32> } // CHECK-LABEL: testBatchMatMulToV2 func.func @testBatchMatMulToV2(%arg0: tensor<2x3x5xf32>, %arg1: tensor<2x5x7xf32>) -> tensor<2x3x7xf32> { - // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} - %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3x5xf32>, tensor<2x5x7xf32>) -> tensor<2x3x7xf32> + // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} + %0 = "tf.BatchMatMul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3x5xf32>, tensor<2x5x7xf32>) -> tensor<2x3x7xf32> func.return %0: tensor<2x3x7xf32> } // CHECK-LABEL: testDynamicBatchMatMulToV2 func.func @testDynamicBatchMatMulToV2(%arg0: tensor<2x3x5xf32>, %arg1: 
tensor) -> tensor<2x3x7xf32> {
- // CHECK: "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+ // CHECK: "tf.BatchMatMul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
%0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3x5xf32>, tensor) -> tensor<2x3x7xf32>
func.return %0: tensor<2x3x7xf32>
}
// CHECK-LABEL: testBatchMatMulToMatMul
func.func @testBatchMatMulToMatMul(%arg0: tensor<2x3xf32>, %arg1: tensor<3x2xf32>) -> tensor<2x2xf32> {
- // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", transpose_a = false, transpose_b = false} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32>
+ // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32>
%0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32>
// CHECK: return %0
func.return %0: tensor<2x2xf32>
@@ -49,7 +49,7 @@ func.func @testBatchMatMulToMatMul(%arg0: tensor<2x3xf32>, %arg1: tensor<3x2xf32
// CHECK-LABEL: testBatchMatMulV2ToMatMul
func.func @testBatchMatMulV2ToMatMul(%arg0: tensor<4x3xf32>, %arg1: tensor<4x5xf32>) -> tensor<3x5xf32> {
- // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", transpose_a = true, transpose_b = false} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32>
+ // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) <{transpose_a = true, transpose_b = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32>
%0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32>
// CHECK: return %0
func.return %0: tensor<3x5xf32>
@@ -58,7 +58,7 @@ func.func @testBatchMatMulV2ToMatMul(%arg0: tensor<4x3xf32>, %arg1: tensor<4x5xf
// CHECK-LABEL: testBiasAddV1ToBiasAdd
func.func @testBiasAddV1ToBiasAdd(%arg0: tensor<*xf32>, %arg1: tensor<128xf32>) -> tensor<*xf32> {
- // CHECK: "tf.BiasAdd"(%arg0, %arg1) {data_format = "NHWC", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<128xf32>) -> tensor<*xf32>
+ // CHECK: "tf.BiasAdd"(%arg0, %arg1) <{data_format = "NHWC"}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<128xf32>) -> tensor<*xf32>
%0 = "tf.BiasAddV1"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xf32>, tensor<128xf32>) -> tensor<*xf32>
func.return %0: tensor<*xf32>
}
@@ -124,8 +124,8 @@ func.func @testDifferentCastType(%arg0: tensor<8x16x32x64xf32>) -> (tensor<8x16x
%1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
func.return %0, %1: tensor<8x16x32x64xi32>, tensor<8x16x32x64xi32>
- // CHECK: %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
- // CHECK: %1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
+ // CHECK: %0 = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
+ // CHECK: %1 = "tf.Cast"(%arg0) <{Truncate = true}> : (tensor<8x16x32x64xf32>) -> tensor<8x16x32x64xi32>
// CHECK: return %0, %1
}
@@ -135,8 +135,8 @@ func.func @testCompatibleCastType(%arg0: tensor) -> (tensor<10xf32>, tens
%1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor) -> tensor<10xf32>
func.return %0, %1: tensor<10xf32>, tensor<10xf32>
- // CHECK: %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor) -> tensor<10xf32>
- // CHECK: %1 = "tf.Cast"(%arg0) {Truncate = true} : (tensor) -> tensor<10xf32>
+ // CHECK: %0 = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor) -> tensor<10xf32>
+ // CHECK: %1 = "tf.Cast"(%arg0) <{Truncate = true}> : (tensor) -> tensor<10xf32>
// CHECK: return %0, %1
}
@@ -181,11 +181,11 @@ func.func @testConcatCwiseUnary(%arg0: tensor, %arg1: tensor,
func.func @testConcatCwiseBinaryOnInnerDim(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor {
- // CHECK-DAG: %[[LHS_AXIS:.*]] = "tf.Const"() {value = dense<1> : tensor}
+ // CHECK-DAG: %[[LHS_AXIS:.*]] = "tf.Const"() <{value = dense<1> : tensor}>
- // CHECK: %[[ADD_LHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+ // CHECK: %[[ADD_LHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) <{axis = 0 : i64}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
// CHECK: %[[MUL_LHS_CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %[[LHS_AXIS]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
- // CHECK: %[[MUL_RHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+ // CHECK: %[[MUL_RHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) <{axis = 0 : i64}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
// CHECK: %[[MUL:.*]] = "tf.Mul"(%[[MUL_LHS_CONCAT]], %[[MUL_RHS_CONCAT]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
// CHECK-SAME: (tensor, tensor<2xf32>) -> tensor
@@ -209,11 +209,11 @@ func.func @testConcatCwiseBinaryOnInnerDim(%arg0: tensor,
func.func @testConcatCwiseBinaryPreserveAxisType(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor {
- // CHECK-DAG: %[[LHS_AXIS:.*]] = "tf.Const"() {value = dense<1> : tensor}
+ // CHECK-DAG: %[[LHS_AXIS:.*]] = "tf.Const"() <{value = dense<1> : tensor}>
- // CHECK: %[[ADD_LHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+ // CHECK: %[[ADD_LHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) <{axis = 0 : i64}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
// CHECK: %[[MUL_LHS_CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %[[LHS_AXIS]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
- // CHECK: %[[MUL_RHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) {axis = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"}
+ // CHECK: %[[MUL_RHS_CONCAT:.*]] = "tf.Pack"(%arg2, %arg3) <{axis = 0 : i64}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
// CHECK: %[[MUL:.*]] = "tf.Mul"(%[[MUL_LHS_CONCAT]], %[[MUL_RHS_CONCAT]])
// CHECK-SAME: {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
@@ -287,8 +287,8 @@ func.func @testConcatCwiseBinaryNegativeAxis(%arg0: tensor,
// Synthesize binary ops when 1 of the 3 concat inputs is a non-binary op.
// CHECK-LABEL: testConcatCwiseBinarySynthMulOp3Inputs func.func @testConcatCwiseBinarySynthMulOp3Inputs(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // CHECK-NEXT: %[[CONST0:.*]] = "tf.Const"() {value = dense<[2.000000e+00, 3.000000e+00, 1.000000e+00]> + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor + // CHECK-NEXT: %[[CONST0:.*]] = "tf.Const"() <{value = dense<[2.000000e+00, 3.000000e+00, 1.000000e+00]> // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %arg2, %[[CONST]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: "tf.Mul"(%[[CONCAT]], %[[CONST0]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} %axis = "tf.Const"() { value = dense<1> : tensor } : () -> tensor @@ -303,7 +303,7 @@ func.func @testConcatCwiseBinarySynthMulOp3Inputs(%arg0: tensor, %arg1: // Similar to the above, with tf.Sub as the binary op kind. func.func @testConcatCwiseBinarySynthSubOp3Inputs(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<[2.000000e+00, 3.000000e+00, 0.000000e+00]> + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<[2.000000e+00, 3.000000e+00, 0.000000e+00]> // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%arg0, %arg1, %arg2, // CHECK: "tf.Sub"(%[[CONCAT]], %[[CONST]]) %axis = "tf.Const"() { value = dense<1> : tensor } : () -> tensor @@ -681,8 +681,8 @@ func.func @testTileMultiplesAllOnes(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK-LABEL: func @testStaticAndIdenticalTypeForEqualOp func.func @testStaticAndIdenticalTypeForEqualOp(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { // CHECK: "tf.Equal"(%arg0, %arg1) - // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" // CHECK-SAME: incompatible_shape_error = true + // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" %0 = "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> func.return %0: tensor<2xi1> } @@ -690,8 +690,8 @@ func.func @testStaticAndIdenticalTypeForEqualOp(%arg0: tensor<2xi32>, %arg1: ten // CHECK-LABEL: func @testStaticAndIdenticalTypeForNotEqualOp func.func @testStaticAndIdenticalTypeForNotEqualOp(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { // CHECK: "tf.NotEqual"(%arg0, %arg1) - // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" // CHECK-SAME: incompatible_shape_error = true + // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> func.return %0: tensor<2xi1> } @@ -707,8 +707,8 @@ func.func @testUnknownBroadcastForNotEqualOp(%arg0: tensor, %arg1: tensor // CHECK-LABEL: func @testKnownGoodBroadcastForNotEqualOp func.func @testKnownGoodBroadcastForNotEqualOp(%arg0: tensor<1x?xi32>, %arg1: tensor) -> tensor { // CHECK: "tf.NotEqual"(%arg0, %arg1) - // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" // CHECK-SAME: incompatible_shape_error = true + // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1x?xi32>, tensor) -> tensor func.return %0: 
tensor } @@ -740,8 +740,8 @@ func.func @testUnrankedLHSForNotEqualOp(%arg0: tensor<*xi32>, %arg1: tensor // CHECK-LABEL: func @testScalarForNotEqualOp func.func @testScalarForNotEqualOp(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK: "tf.NotEqual"(%arg0, %arg1) - // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" // CHECK-SAME: incompatible_shape_error = true + // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor, tensor) -> tensor func.return %0: tensor } @@ -752,7 +752,7 @@ func.func @testLogicalNotOfEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32 %1 = "tf.LogicalNot"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xi1>) -> tensor<8x16xi1> func.return %1: tensor<8x16xi1> - // CHECK: %[[NE:.*]] = "tf.NotEqual"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true} + // CHECK: %[[NE:.*]] = "tf.NotEqual"(%arg0, %arg1) <{incompatible_shape_error = true}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: return %[[NE]] } @@ -762,7 +762,7 @@ func.func @testLogicalNotOfNotEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16x %1 = "tf.LogicalNot"(%0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x16xi1>) -> tensor<8x16xi1> func.return %1: tensor<8x16xi1> - // CHECK: %[[NE:.*]] = "tf.Equal"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true} + // CHECK: %[[NE:.*]] = "tf.Equal"(%arg0, %arg1) <{incompatible_shape_error = true}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: return %[[NE]] } @@ -811,7 +811,7 @@ func.func @testSizeFolding(%arg0: tensor<3x5x7xf32>) -> tensor { %0 = "tf.Size"(%arg0) : (tensor<3x5x7xf32>) -> tensor func.return %0: tensor -// CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<105> : tensor} : () -> tensor +// CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<105> : tensor}> : () -> tensor // CHECK: return %[[CONST]] : tensor } @@ -873,7 +873,7 @@ func.func @testXdivyWithSqrtDivisor(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16x // CHECK-LABEL: @identityTranspose func.func @identityTranspose(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x5x6xf32> { - %0 = "tf.Const"() {value = dense<[0, 1, 2, 3, 4]> : tensor<5xi32>} : () -> tensor<5xi32> + %0 = "tf.Const"() <{value = dense<[0, 1, 2, 3, 4]> : tensor<5xi32>}> : () -> tensor<5xi32> %1 = "tf.Transpose"(%arg0, %0) : (tensor<2x3x4x5x6xf32>, tensor<5xi32>) -> tensor<2x3x4x5x6xf32> func.return %1 : tensor<2x3x4x5x6xf32> @@ -895,7 +895,7 @@ func.func @nonIdentityTranspose(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x6x %1 = "tf.Transpose"(%arg0, %0) : (tensor<2x3x4x5x6xf32>, tensor<5xi32>) -> tensor<2x3x4x6x5xf32> func.return %1 : tensor<2x3x4x6x5xf32> - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<[0, 1, 2, 4, 3]> : tensor<5xi32>} : () -> tensor<5xi32> + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<[0, 1, 2, 4, 3]> : tensor<5xi32>}> : () -> tensor<5xi32> // CHECK: %[[TRANS:.*]] = "tf.Transpose"(%arg0, %[[CONST]]) : (tensor<2x3x4x5x6xf32>, tensor<5xi32>) -> tensor<2x3x4x6x5xf32> // CHECK: return %[[TRANS]] } @@ -924,8 +924,8 @@ func.func @nonCancellableTransposeCrossRegion(%arg0: tensor<1x4x4x8xf32>) -> ten func.return %result : tensor<1x4x4x8xf32> - // CHECK-DAG: %[[CONST1:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} - // 
CHECK-DAG: %[[CONST2:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK-DAG: %[[CONST1:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> + // CHECK-DAG: %[[CONST2:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> // CHECK: %[[TRANS1:.*]] = "tf.Transpose"(%arg0, %[[CONST1]]) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> // CHECK: %[[TRANS2:.*]] = "tf.Transpose"(%[[TRANS1]], %[[CONST2]]) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x8xf32> // CHECK: return %[[TRANS2]] @@ -951,8 +951,8 @@ func.func @nonCancellableTranspose(%arg0: tensor<1x4x4x8xf32>) -> tensor<4x1x4x8 func.return %3 : tensor<4x1x4x8xf32> - // CHECK-DAG: %[[CONST1:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} - // CHECK-DAG: %[[CONST2:.*]] = "tf.Const"() {value = dense<[2, 0, 3, 1]> : tensor<4xi32>} + // CHECK-DAG: %[[CONST1:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> + // CHECK-DAG: %[[CONST2:.*]] = "tf.Const"() <{value = dense<[2, 0, 3, 1]> : tensor<4xi32>}> // CHECK: %[[TRANS1:.*]] = "tf.Transpose"(%arg0, %[[CONST1]]) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> // CHECK: %[[TRANS2:.*]] = "tf.Transpose"(%[[TRANS1]], %[[CONST2]]) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<4x1x4x8xf32> // CHECK: return %[[TRANS2]] @@ -969,8 +969,8 @@ func.func @addN(%arg0: tensor<*xf32>) -> tensor<*xf32> { func.func @addNWithZerosFloat(%arg0: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) { %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> %1 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> - // CHECK-DAG: [[ZERO:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} - // CHECK-DAG: [[ONE:%.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} + // CHECK-DAG: [[ZERO:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> + // CHECK-DAG: [[ONE:%.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2xf32>}> // CHECK: [[ADD_N:%.*]] = "tf.AddN"(%arg0, [[ZERO]], [[ONE]]) // CHECK: return %arg0, %arg0, [[ZERO]], [[ADD_N]] %2 = "tf.AddN"(%arg0, %1, %1) : (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> @@ -984,8 +984,8 @@ func.func @addNWithZerosFloat(%arg0: tensor<2xf32>) -> (tensor<2xf32>, tensor<2x func.func @addNWithZerosInt(%arg0: tensor<2xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) { %0 = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK-DAG: [[ZERO:%.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} - // CHECK-DAG: [[ONE:%.*]] = "tf.Const"() {value = dense<1> : tensor<2xi32>} + // CHECK-DAG: [[ZERO:%.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> + // CHECK-DAG: [[ONE:%.*]] = "tf.Const"() <{value = dense<1> : tensor<2xi32>}> // CHECK: [[ADD_N:%.*]] = "tf.AddN"(%arg0, [[ZERO]], [[ONE]]) // CHECK: return %arg0, %arg0, [[ZERO]], [[ADD_N]] %2 = "tf.AddN"(%arg0, %1, %1) : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> @@ -998,7 +998,7 @@ func.func @addNWithZerosInt(%arg0: tensor<2xi32>) -> (tensor<2xi32>, tensor<2xi3 // CHECK-LABEL: func @addNSkipFoldingIfBroadcasting func.func @addNSkipFoldingIfBroadcasting(%arg0: tensor<1xf32>) -> tensor<10xf32> { %0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<10xf32>} : () -> tensor<10xf32> - // CHECK: [[ZERO:%.*]] = "tf.Const"() {value = 
dense<0.000000e+00> : tensor<10xf32>} + // CHECK: [[ZERO:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<10xf32>}> // CHECK: [[ADD_N:%.*]] = "tf.AddN"(%arg0, [[ZERO]]) // CHECK: return [[ADD_N]] %1 = "tf.AddN"(%arg0, %0) : (tensor<1xf32>, tensor<10xf32>) -> tensor<10xf32> @@ -1014,8 +1014,8 @@ func.func @ToBool_0DScalarI1(%arg0: tensor) -> tensor { // CHECK-LABEL: func @ToBool_0DScalarInt func.func @ToBool_0DScalarInt(%arg0: tensor) -> tensor { - // CHECK: [[Zero:%.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true} + // CHECK: [[Zero:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) <{incompatible_shape_error = true}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: return [[NE]] %0 = "tf.ToBool"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor) -> tensor func.return %0 : tensor @@ -1023,8 +1023,8 @@ func.func @ToBool_0DScalarInt(%arg0: tensor) -> tensor { // CHECK-LABEL: func @ToBool_0DScalarFloat func.func @ToBool_0DScalarFloat(%arg0: tensor) -> tensor { - // CHECK: [[Zero:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true} + // CHECK: [[Zero:%.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) <{incompatible_shape_error = true}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: return [[NE]] %0 = "tf.ToBool"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor) -> tensor func.return %0 : tensor @@ -1032,8 +1032,8 @@ func.func @ToBool_0DScalarFloat(%arg0: tensor) -> tensor { // CHECK-LABEL: func @ToBool_0DScalarString func.func @ToBool_0DScalarString(%arg0: tensor) -> tensor { - // CHECK: [[EmptyStr:%.*]] = "tf.Const"() {value = dense<""> : tensor} : () -> tensor - // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[EmptyStr]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0", incompatible_shape_error = true} : (tensor, tensor) -> tensor + // CHECK: [[EmptyStr:%.*]] = "tf.Const"() <{value = dense<""> : tensor}> : () -> tensor + // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[EmptyStr]]) <{incompatible_shape_error = true}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor, tensor) -> tensor // CHECK: return [[NE]] : tensor %0 = "tf.ToBool"(%arg0) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor) -> tensor func.return %0 : tensor @@ -1041,7 +1041,7 @@ func.func @ToBool_0DScalarString(%arg0: tensor) -> tensor { // CHECK-LABEL: func @ToBool_1DTensor func.func @ToBool_1DTensor(%arg0: tensor<1xf32>) -> tensor { - // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: [[Const:%.*]] = "tf.Const"() <{value = dense : tensor}> : () -> tensor // CHECK: return [[Const]] %0 = "tf.ToBool"(%arg0) : (tensor<1xf32>) -> tensor func.return %0 : tensor @@ -1049,7 +1049,7 @@ func.func @ToBool_1DTensor(%arg0: tensor<1xf32>) -> tensor { // CHECK-LABEL: func @ToBool_1DTensorZeroDim func.func @ToBool_1DTensorZeroDim(%arg0: tensor<0xf32>) -> tensor { - // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: [[Const:%.*]] = "tf.Const"() <{value = dense : tensor}> : () -> 
tensor // CHECK: return [[Const]] %0 = "tf.ToBool"(%arg0) : (tensor<0xf32>) -> tensor func.return %0 : tensor @@ -1057,7 +1057,7 @@ func.func @ToBool_1DTensorZeroDim(%arg0: tensor<0xf32>) -> tensor { // CHECK-LABEL: func @ToBool_2DTensor func.func @ToBool_2DTensor(%arg0: tensor<1x5xf32>) -> tensor { - // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: [[Const:%.*]] = "tf.Const"() <{value = dense : tensor}> : () -> tensor // CHECK: return [[Const]] %0 = "tf.ToBool"(%arg0) : (tensor<1x5xf32>) -> tensor func.return %0 : tensor @@ -1065,7 +1065,7 @@ func.func @ToBool_2DTensor(%arg0: tensor<1x5xf32>) -> tensor { // CHECK-LABEL: func @ToBool_2DTensorZeroDim func.func @ToBool_2DTensorZeroDim(%arg0: tensor<1x0xf32>) -> tensor { - // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: [[Const:%.*]] = "tf.Const"() <{value = dense : tensor}> : () -> tensor // CHECK: return [[Const]] %0 = "tf.ToBool"(%arg0) : (tensor<1x0xf32>) -> tensor func.return %0 : tensor @@ -1098,7 +1098,7 @@ func.func @testReadVariableOpOfCastMultiUse(%arg0: tensor, tensor) -> () func.return %1: tensor - // CHECK: %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor>>) -> tensor<*x!tf_type.resource> + // CHECK: %0 = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor>>) -> tensor<*x!tf_type.resource> // CHECK: %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf_type.resource>) -> tensor // CHECK: "tf.AssignVariableOp"(%0, %1) : (tensor<*x!tf_type.resource>, tensor) -> () // CHECK: return %1 @@ -1118,7 +1118,7 @@ func.func @testMultiReadVariableOpsOfCast(%arg0: tensor) -> tensor { - // CHECK:[[VAL0:%.+]] = "tf.Const"() {value = dense<3> : tensor} + // CHECK:[[VAL0:%.+]] = "tf.Const"() <{value = dense<3> : tensor}> %0 = "tf.Rank"(%arg0) : (tensor<4x3x2xf32>) -> tensor // CHECK: return [[VAL0]] @@ -1143,16 +1143,16 @@ func.func @testRankOfRankedTensorDynamicShapeOutput(%arg0 : tensor<4x3x2xf32>) - func.func @foldFill() -> (tensor<3x2x1xf32>, tensor<*xf32>, tensor<*xcomplex>) { %0 = "tf.Const"() {value = dense<[3, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %1 = "tf.Const"() {value = dense<23.0> : tensor} : () -> tensor - // CHECK-DAG: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + // CHECK-DAG: "tf.Const"() <{value = dense<2.300000e+01> : tensor<3x2x1xf32>}> %2 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<3x2x1xf32> - // CHECK-DAG: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + // CHECK-DAG: "tf.Const"() <{value = dense<2.300000e+01> : tensor<3x2x1xf32>}> %3 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<*xf32> %complex_cst = "tf.Const"() {value = dense<(0.000000e+00,1.000000e+00)> : tensor>} : () -> tensor> // Here, custom folder doesn't handle complex dtypes and it is folded through // the constant folding hook. // TODO(hinsu): Handle complex dtypes in the custom folder for FillOp. 
- // CHECK-DAG: "tf.Const"() {value = dense<(0.000000e+00,1.000000e+00)> : tensor<3x2x1xcomplex>} : () -> tensor<*xcomplex> + // CHECK-DAG: "tf.Const"() <{value = dense<(0.000000e+00,1.000000e+00)> : tensor<3x2x1xcomplex>}> : () -> tensor<*xcomplex> %4 = "tf.Fill"(%0, %complex_cst) : (tensor<3xi32>, tensor>) -> tensor<*xcomplex> func.return %2, %3, %4 : tensor<3x2x1xf32>, tensor<*xf32>, tensor<*xcomplex> @@ -1164,13 +1164,13 @@ func.func @foldIf(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> %1 = "tf.Const"() {value = dense : tensor} : () -> tensor // CHECK: %0 = "tf.PartitionedCall"(%arg0, %arg1) - // CHECK-SAME: device = "noodle" // CHECK-SAME: f = @sub + // CHECK-SAME: device = "noodle" %2 = "tf.If"(%0, %arg0, %arg1) {then_branch = @add, else_branch = @sub, output_shapes = [#tf_type.shape<>], device = "noodle", is_stateless = true} : (tensor, tensor, tensor) -> tensor // CHECK: %1 = "tf.StatefulPartitionedCall"(%0, %arg1) + // CHECK-SAME: f = @add // CHECK-SAME: _underscore_attr = "something" // CHECK-SAME: device = "noodle" - // CHECK-SAME: f = @add %3 = "tf.If"(%1, %2, %arg1) {then_branch = @add, else_branch = @sub, output_shapes = [#tf_type.shape<>], device = "noodle", _underscore_attr = "something", is_stateless = false} : (tensor, tensor, tensor) -> tensor // CHECK: %2 = "tf.If" @@ -1233,13 +1233,13 @@ func.func @foldIfRegionMismatchedTypes(%arg0: tensor, %arg1: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { // CHECK: %[[PRED:.*]] = "tf._SomeOp"() : () -> tensor %pred = "tf._SomeOp"() : () -> tensor - // CHECK: %[[IF_OUTPUT:.*]] = "tf.IfRegion"(%[[PRED]]) ({ + // CHECK: %[[IF_OUTPUT:.*]] = "tf.IfRegion"(%[[PRED]]) <{is_stateless = true}> ({ // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[ARG0]], %[[ARG1]]) // CHECK: "tf.Yield"(%[[MUL]]) : (tensor) // CHECK: }, { // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[ARG0]], %[[ARG1]]) // CHECK: "tf.Yield"(%[[SUB]]) : (tensor) - // CHECK: }) {device = "/job:localhost/replica:0/task:0/device:GPU:0", is_stateless = true} : (tensor) -> tensor + // CHECK: }) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor) -> tensor %0:4 = "tf.IfRegion"(%pred) ({ %true_value = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor "tf.Yield"(%arg1, %arg2, %true_value, %arg2) : (tensor, tensor, tensor, tensor) -> () @@ -1260,7 +1260,7 @@ func.func @eliminatePassThroughIfRegion(%arg0: tensor, %arg1: tensor, func.func @eliminatePassThroughCaseRegion(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { // CHECK: %[[INDEX:.*]] = "tf._SomeOp"() : () -> tensor %index = "tf._SomeOp"() : () -> tensor - // CHECK: %[[CASE_OUTPUT:.*]] = "tf.CaseRegion"(%[[INDEX]]) ({ + // CHECK: %[[CASE_OUTPUT:.*]] = "tf.CaseRegion"(%[[INDEX]]) <{is_stateless = true}> ({ // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[ARG0]], %[[ARG1]]) // CHECK: "tf.Yield"(%[[MUL]]) : (tensor) // CHECK: }, { @@ -1269,7 +1269,7 @@ func.func @eliminatePassThroughCaseRegion(%arg0: tensor, %arg1: tensor // CHECK: }, { // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ARG0]], %[[ARG1]]) // CHECK: "tf.Yield"(%[[ADD]]) : (tensor) - // CHECK: }) {device = "/job:localhost/replica:0/task:0/device:GPU:0", is_stateless = true} : (tensor) -> tensor + // CHECK: }) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor) -> tensor %0:3 = "tf.CaseRegion"(%index) ({ %mul = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor "tf.Yield"(%arg1, %mul, %arg2) : (tensor, tensor, tensor) -> () @@ -1293,13 +1293,13 @@ func.func @foldCase(%arg0: tensor, %arg1: tensor) -> (tensor) { %3 = arith.constant dense<0> : tensor // CHECK: 
PartitionedCall - // CHECK-SAME: device = "noodle" // CHECK-SAME: f = @add + // CHECK-SAME: device = "noodle" %4 = "tf.Case"(%2, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf_type.shape<>], device = "noodle", is_stateless = false} : (tensor, tensor, tensor) -> tensor // CHECK: PartitionedCall + // CHECK-SAME: f = @sub // CHECK-SAME: _cluster_launch = "not_ready" // CHECK-SAME: device = "noodle" - // CHECK-SAME: f = @sub %5 = "tf.Case"(%3, %4, %arg1) {branches = [@sub, @add], output_shapes = [#tf_type.shape<>], device= "noodle", _cluster_launch = "not_ready", is_stateless = false} : (tensor, tensor, tensor) -> tensor func.return %5 : tensor } @@ -1317,7 +1317,7 @@ func.func @sub(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: testBatchToSpaceToBatchToSpaceND // CHECK-SAME: ([[INPUT:%.*]]: tensor, [[CROPS:%.*]]: tensor) func.func @testBatchToSpaceToBatchToSpaceND(%arg0: tensor, %arg1: tensor) -> tensor<*xf32> { - // CHECK: [[BLOCK_SHAPE:%.*]] = "tf.Const"() {value = dense<8> : tensor<2xi64>} + // CHECK: [[BLOCK_SHAPE:%.*]] = "tf.Const"() <{value = dense<8> : tensor<2xi64>}> // CHECK: [[BATCH_TO_SHAPE_ND:%.*]] = "tf.BatchToSpaceND"([[INPUT]], [[BLOCK_SHAPE]], [[CROPS]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 8 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor, tensor) -> tensor<*xf32> // CHECK: return [[BATCH_TO_SHAPE_ND]] @@ -1612,8 +1612,8 @@ func.func private @testIfElse(tensor<*xf32>) -> tensor<*xf32> func.func @testIfDropOutputShapes(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): // CHECK: "tf.If" - // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" // CHECK-NOT: output_shapes + // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:GPU:0" %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, is_stateless = false, output_shapes = [#tf_type.shape<>], device = "/job:localhost/replica:0/task:0/device:GPU:0" } : (tensor, tensor<2xf32>) -> tensor<2xf32> @@ -1647,10 +1647,10 @@ func.func @testSumFoldBypass(%arg0: tensor<4x?xf16>, %arg1: tensor<*xi64>) -> te // CHECK-LABEL: @testMatrixDiag func.func @testMatrixDiag(%diag: tensor<2x4xf32>) -> tensor<2x4x4xf32> { - // CHECK-DAG: %[[MINUS1:.*]] = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor - // CHECK-DAG: %[[ZEROI:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-DAG: %[[ZEROF:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: "tf.MatrixDiagV3"(%arg0, %[[ZEROI]], %[[MINUS1]], %[[MINUS1]], %[[ZEROF]]) {align = "RIGHT_LEFT", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x4xf32>, tensor, tensor, tensor, tensor) -> tensor<2x4x4xf32> + // CHECK-DAG: %[[MINUS1:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor + // CHECK-DAG: %[[ZEROI:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK-DAG: %[[ZEROF:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: "tf.MatrixDiagV3"(%arg0, %[[ZEROI]], %[[MINUS1]], %[[MINUS1]], %[[ZEROF]]) <{align = "RIGHT_LEFT"}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x4xf32>, tensor, tensor, tensor, tensor) -> tensor<2x4x4xf32> %0 = "tf.MatrixDiag"(%diag) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x4xf32>) -> tensor<2x4x4xf32> func.return %0 : tensor<2x4x4xf32> } @@ -1660,9 
+1660,9 @@ func.func @testMatrixSetDiag(%arg0: tensor<3x3xi64>, %arg1: tensor<3xi64>) -> te %0 = "tf.MatrixSetDiag"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<3x3xi64>, tensor<3xi64>) -> tensor<3x3xi64> func.return %0 : tensor<3x3xi64> - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0> : tensor}> // CHECK: %[[RES:.*]] = "tf.MatrixSetDiagV3"(%arg0, %arg1, %[[ZERO]]) - // CHECK-SAME: {align = "RIGHT_LEFT", device = "/job:localhost/replica:0/task:0/device:GPU:0"} + // CHECK-SAME: <{align = "RIGHT_LEFT"}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK-SAME: (tensor<3x3xi64>, tensor<3xi64>, tensor) -> tensor<3x3xi64> } @@ -1672,7 +1672,7 @@ func.func @testMatrixSetDiagV2(%arg0: tensor<3x3xi64>, %arg1: tensor<3xi64>, %ar func.return %0 : tensor<3x3xi64> // CHECK: %[[RES:.*]] = "tf.MatrixSetDiagV3"(%arg0, %arg1, %arg2) - // CHECK-SAME: {align = "LEFT_LEFT", device = "/job:localhost/replica:0/task:0/device:GPU:0"} + // CHECK-SAME: <{align = "LEFT_LEFT"}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} } // CHECK-LABEL: @testVariableToVariableV2 @@ -1680,7 +1680,7 @@ func.func @testVariableToVariableV2() { // CHECK-NOT: "tf.Variable" %0 = "tf.Const"() { value = dense<1> : tensor } : () -> tensor - // CHECK: "tf.VariableV2"() {container = "", device = "/job:localhost/replica:0/task:0/device:GPU:0", shape = #tf_type.shape<>, shared_name = "var"} + // CHECK: "tf.VariableV2"() <{container = "", shape = #tf_type.shape<>, shared_name = "var"}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} %1 = "tf.Variable"() {container = "", dtype = i32, shared_name = "var", shape = #tf_type.shape<>, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> tensor %2 = "tf.Assign"(%1, %0) : (tensor, tensor) -> (tensor) @@ -1691,7 +1691,7 @@ func.func @testVariableToVariableV2() { func.func @testUnpackAndCwiseUnary(%arg0: tensor) -> (tensor, tensor) { // CHECK: %[[NEG:.*]] = "tf.Neg"(%arg0) {device = ""} - // CHECK: %[[UNPACK:.*]]:2 = "tf.Unpack"(%[[NEG]]) {axis = 1 : i64, device = ""} + // CHECK: %[[UNPACK:.*]]:2 = "tf.Unpack"(%[[NEG]]) <{axis = 1 : i64}> {device = ""} %unpacked:2 = "tf.Unpack"(%arg0) {axis = 1 : i64, device = ""} : (tensor) -> (tensor, tensor) %0 = "tf.Neg"(%unpacked#0): (tensor) -> tensor @@ -1708,7 +1708,7 @@ func.func @testFoldStridedSliceShapeI32(%arg0: tensor) -> (tensor<2 %2 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> %3 = "tf.StridedSlice"(%2, %1, %0, %1) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> func.return %3 : tensor<2xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: return %[[CST]] } @@ -1719,7 +1719,7 @@ func.func @testFoldStridedSliceShapeI64(%arg0: tensor) -> (tensor<2 %2 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi64> %3 = "tf.StridedSlice"(%2, %1, %0, %1) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi64> func.return %3 : tensor<2xi64> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi64>} : () -> tensor<2xi64> + // CHECK: 
%[[CST:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: return %[[CST]] } @@ -1730,7 +1730,7 @@ func.func @testFoldStridedSliceShapeDynamicOutput(%arg0: tensor) -> %2 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> %3 = "tf.StridedSlice"(%2, %1, %0, %1) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor func.return %3 : tensor - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor // CHECK: return %[[CST]] } @@ -1741,7 +1741,7 @@ func.func @testFoldStridedSliceShapeWithShrinkAxisMaskI32(%arg0: tensor) -> tensor<4xi32> %3 = "tf.StridedSlice"(%2, %1, %0, %1) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor func.return %3 : tensor - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: return %[[CST]] } @@ -1752,7 +1752,7 @@ func.func @testFoldStridedSliceShapeWithShrinkAxisMaskI64(%arg0: tensor) -> tensor<4xi64> %3 = "tf.StridedSlice"(%2, %1, %0, %1) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<4xi64>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor func.return %3 : tensor - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: return %[[CST]] } @@ -1763,7 +1763,7 @@ func.func @testFoldStridedSliceShapeWithShrinkAxisMaskUnrankedOutput(%arg0: tens %2 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> %3 = "tf.StridedSlice"(%2, %1, %0, %1) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<*xi32> func.return %3 : tensor<*xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor<*xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor<*xi32> // CHECK: return %[[CST]] } @@ -1775,7 +1775,7 @@ func.func @testFoldStridedSliceShapeWithShrinkAxisMaskNegativeBegin1(%arg0: tens %3 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> %4 = "tf.StridedSlice"(%3, %0, %1, %2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor func.return %4 : tensor - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK: return %[[CST]] } @@ -1787,7 +1787,7 @@ func.func @testFoldStridedSliceShapeWithShrinkAxisMaskNegativeBegin2(%arg0: tens %3 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> %4 = "tf.StridedSlice"(%3, %0, %1, %2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor func.return %4 : tensor - // CHECK: %[[CST:.*]] = "tf.Const"() 
{value = dense<2> : tensor} : () -> tensor + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor // CHECK: return %[[CST]] } @@ -1811,7 +1811,7 @@ func.func @testFoldStridedSliceShapeWithBeginMask(%arg0: tensor<1x2x3x?xf32>) -> %3 = "tf.Shape"(%arg0) : (tensor<1x2x3x?xf32>) -> tensor<4xi32> %4 = "tf.StridedSlice"(%3, %0, %1, %2) {begin_mask = 1 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> func.return %4 : tensor<2xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: return %[[CST]] } @@ -1822,7 +1822,7 @@ func.func @testFoldStridedSliceShapeWithEndMask(%arg0: tensor) -> ( %2 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> %3 = "tf.StridedSlice"(%2, %1, %0, %1) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> func.return %3 : tensor<3xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 2, 3]> : tensor<3xi32>} : () -> tensor<3xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 2, 3]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: return %[[CST]] } @@ -1834,7 +1834,7 @@ func.func @testFoldStridedSliceShapeWithPositiveStrides(%arg0: tensor<1x2x3x4x?x %3 = "tf.Shape"(%arg0) : (tensor<1x2x3x4x?xf32>) -> tensor<5xi32> %4 = "tf.StridedSlice"(%3, %0, %1, %2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<5xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> func.return %4 : tensor<2xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[2, 4]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[2, 4]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: return %[[CST]] } @@ -1845,7 +1845,7 @@ func.func @testFoldStridedSliceShapeWithPositiveStridesOutOfBoundEnd(%arg0: tens %2 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> %3 = "tf.StridedSlice"(%2, %1, %0, %1) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> func.return %3 : tensor<3xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 2, 3]> : tensor<3xi32>} : () -> tensor<3xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 2, 3]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: return %[[CST]] } @@ -1857,7 +1857,7 @@ func.func @testFoldStridedSliceShapeWithNegativeStrides(%arg0: tensor<1x2x3x?xf3 %3 = "tf.Shape"(%arg0) : (tensor<1x2x3x?xf32>) -> tensor<4xi32> %4 = "tf.StridedSlice"(%3, %0, %1, %2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> func.return %4 : tensor<1xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<3> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<3> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: return %[[CST]] } @@ -1869,7 +1869,7 @@ func.func 
@testFoldStridedSliceShapeWithNegativeStridesOutOfBoundBegin(%arg0: te %3 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> %4 = "tf.StridedSlice"(%3, %0, %1, %2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> func.return %4 : tensor<2xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[3, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: return %[[CST]] } @@ -1881,7 +1881,7 @@ func.func @testFoldStridedSliceShapeWithNegativeStridesBeginMask(%arg0: tensor) -> tensor<4xi32> %4 = "tf.StridedSlice"(%3, %0, %1, %2) {begin_mask = 1 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> func.return %4 : tensor<2xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[3, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: return %[[CST]] } @@ -1893,7 +1893,7 @@ func.func @testFoldStridedSliceShapeWithNegativeStridesEndMask(%arg0: tensor<1x2 %3 = "tf.Shape"(%arg0) : (tensor<1x2x3x?xf32>) -> tensor<4xi32> %4 = "tf.StridedSlice"(%3, %0, %1, %2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 1 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> func.return %4 : tensor<3xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[3, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[3, 2, 1]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: return %[[CST]] } @@ -1905,7 +1905,7 @@ func.func @testFoldStridedSliceShapeWithEmptySlice(%arg0: tensor) - %3 = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> %4 = "tf.StridedSlice"(%3, %0, %1, %2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<0xi32> func.return %4 : tensor<0xi32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK: return %[[CST]] } @@ -1914,7 +1914,7 @@ func.func @testFoldEnsureShapeOp(%arg0: tensor<10x20xf32>) -> (tensor<10x20xf32> %0 = "tf.EnsureShape"(%arg0) {shape = #tf_type.shape<10x20>} : (tensor<10x20xf32>) -> tensor<10x20xf32> %1 = "tf.EnsureShape"(%arg0) {shape = #tf_type.shape} : (tensor<10x20xf32>) -> tensor<10x20xf32> // Failing case which should not be folded. 
- // CHECK: %[[NF:.*]] = "tf.EnsureShape"(%arg0) {shape = #tf_type.shape<20x10>} + // CHECK: %[[NF:.*]] = "tf.EnsureShape"(%arg0) <{shape = #tf_type.shape<20x10>}> %2 = "tf.EnsureShape"(%arg0) {shape = #tf_type.shape<20x10>} : (tensor<10x20xf32>) -> tensor<20x10xf32> // CHECK: return %arg0, %arg0, %[[NF]] func.return %0, %1, %2: tensor<10x20xf32>, tensor<10x20xf32>, tensor<20x10xf32> @@ -1924,7 +1924,7 @@ func.func @testFoldEnsureShapeOp(%arg0: tensor<10x20xf32>) -> (tensor<10x20xf32> func.func @testConvertPackToReshapeAxis0(%arg0: tensor<2x3xf32>) -> tensor<1x2x3xf32> { %0 = "tf.Pack"(%arg0) {axis = 0 : i64, _xla_outside_compilation = "1", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>) -> tensor<1x2x3xf32> func.return %0 : tensor<1x2x3xf32> - // CHECK: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[1, 2, 3]> : tensor<3xi64>} : () -> tensor<3xi64> + // CHECK: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[1, 2, 3]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) {_xla_outside_compilation = "1", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3xi64>) -> tensor<1x2x3xf32> // CHECK: return %[[RESHAPE]] : tensor<1x2x3xf32> } @@ -1933,7 +1933,7 @@ func.func @testConvertPackToReshapeAxis0(%arg0: tensor<2x3xf32>) -> tensor<1x2x3 func.func @testConvertPackToReshapeAxis1(%arg0: tensor<2x3xf32>) -> tensor<2x1x3xf32> { %0 = "tf.Pack"(%arg0) {axis = 1 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>) -> tensor<2x1x3xf32> func.return %0 : tensor<2x1x3xf32> - // CHECK: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[2, 1, 3]> : tensor<3xi64>} : () -> tensor<3xi64> + // CHECK: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 1, 3]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3xi64>) -> tensor<2x1x3xf32> // CHECK: return %[[RESHAPE]] : tensor<2x1x3xf32> } @@ -1942,7 +1942,7 @@ func.func @testConvertPackToReshapeAxis1(%arg0: tensor<2x3xf32>) -> tensor<2x1x3 func.func @testDontConvertPackToReshapeDynamicShape(%arg0: tensor<2x?xf32>) -> tensor<1x2x?xf32> { %0 = "tf.Pack"(%arg0) {axis = 0 : i64} : (tensor<2x?xf32>) -> tensor<1x2x?xf32> func.return %0 : tensor<1x2x?xf32> - // CHECK: %[[PACK:.*]] = "tf.Pack"(%arg0) {axis = 0 : i64} : (tensor<2x?xf32>) -> tensor<1x2x?xf32> + // CHECK: %[[PACK:.*]] = "tf.Pack"(%arg0) <{axis = 0 : i64}> : (tensor<2x?xf32>) -> tensor<1x2x?xf32> // CHECK: return %[[PACK]] : tensor<1x2x?xf32> } @@ -1950,7 +1950,7 @@ func.func @testDontConvertPackToReshapeDynamicShape(%arg0: tensor<2x?xf32>) -> t func.func @while_with_id_passthrough(%arg0: tensor<7xf32> {tf._user_specified_name = "x"}) -> tensor attributes {tf.entry_function = {control_outputs = "", inputs = "x", outputs = "identity_RetVal"}} { %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor %1 = "tf.Const"() {value = dense<3> : tensor} : () -> tensor - // CHECK: %[[SHAPE:.*]] = "tf.Const"() {value = dense<7> : tensor<1xi32>} + // CHECK: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<7> : tensor<1xi32>}> %2 = "tf.Const"() {value = dense<7> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.Const"() {value = dense : tensor} : () -> tensor %4 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor @@ -1983,7 +1983,7 @@ func.func @while_with_id_passthrough(%arg0: tensor<7xf32> {tf._user_specified_na func.func 
@testConvertQuantizeAndDequantizeV2ToQuantizeAndDequantizeV4(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { %0 = "tf.QuantizeAndDequantizeV2"(%arg0, %arg1, %arg2) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor, tensor, tensor) -> tensor func.return %0 : tensor - // CHECK: %[[QUANT:.*]] = "tf.QuantizeAndDequantizeV4"(%arg0, %arg1, %arg2) {axis = -1 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0", narrow_range = false, num_bits = 8 : i64, range_given = false, round_mode = "HALF_TO_EVEN", signed_input = true} : (tensor, tensor, tensor) -> tensor + // CHECK: %[[QUANT:.*]] = "tf.QuantizeAndDequantizeV4"(%arg0, %arg1, %arg2) <{axis = -1 : i64, narrow_range = false, num_bits = 8 : i64, range_given = false, round_mode = "HALF_TO_EVEN", signed_input = true}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor, tensor, tensor) -> tensor // CHECK: return %[[QUANT]] : tensor } @@ -1991,10 +1991,10 @@ func.func @testConvertQuantizeAndDequantizeV2ToQuantizeAndDequantizeV4(%arg0 : t func.func @testHashTableAndInitializeTableToV2(%arg0: tensor) { // CHECK: [[handle:%.*]] = "tf.HashTableV2"() // CHECK-SAME: container = "" - // CHECK-SAME: device = "" // CHECK-SAME: key_dtype = !tf_type.string // CHECK-SAME: shared_name = "table" // CHECK-SAME: value_dtype = i32 + // CHECK-SAME: device = "" // CHECK-SAME: () -> tensor %handle = "tf.HashTable"() {container = "", device = "", shared_name = "table", key_dtype = !tf_type.string, value_dtype = i32} : () -> tensor<*x!tf_type.stringref> @@ -2008,10 +2008,10 @@ func.func @testHashTableAndInitializeTableToV2(%arg0: tensor) { func.func @testHashTableAndLookupTableSizeToV2() -> tensor { // CHECK: [[handle:%.*]] = "tf.HashTableV2"() // CHECK-SAME: container = "" - // CHECK-SAME: device = "" // CHECK-SAME: key_dtype = !tf_type.string // CHECK-SAME: shared_name = "table" // CHECK-SAME: value_dtype = i32 + // CHECK-SAME: device = "" // CHECK-SAME: () -> tensor %handle = "tf.HashTable"() {container = "", device = "", shared_name = "table", key_dtype = !tf_type.string, value_dtype = i32} : () -> tensor<*x!tf_type.stringref> @@ -2025,10 +2025,10 @@ func.func @testHashTableAndLookupTableSizeToV2() -> tensor { func.func @testHashTableAndLookupTableFindToV2(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK: [[handle:%.*]] = "tf.HashTableV2"() // CHECK-SAME: container = "" - // CHECK-SAME: device = "" // CHECK-SAME: key_dtype = !tf_type.string // CHECK-SAME: shared_name = "table" // CHECK-SAME: value_dtype = i32 + // CHECK-SAME: device = "" // CHECK-SAME: () -> tensor %handle = "tf.HashTable"() {container = "", device = "", shared_name = "table", key_dtype = !tf_type.string, value_dtype = i32} : () -> tensor<*x!tf_type.stringref> @@ -2041,9 +2041,9 @@ func.func @testHashTableAndLookupTableFindToV2(%arg0: tensor, % // CHECK-LABEL: testDivNoNanAndMulNoNanWithConstantY // CHECK-SAME: (%[[ARG0:.*]]: tensor<2xf32>) func.func @testDivNoNanAndMulNoNanWithConstantY(%arg0: tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) { - // CHECK: %[[CON1:.*]] = "tf.Const"() {value = dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32> - // CHECK-NEXT: %[[CON2:.*]] = "tf.Const"() {value = dense<[1.000000e+01, 0.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32> - // CHECK-NEXT: %[[CON3:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + // CHECK: %[[CON1:.*]] = "tf.Const"() <{value = dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf32>}> : () -> tensor<2xf32> + 
// CHECK-NEXT: %[[CON2:.*]] = "tf.Const"() <{value = dense<[1.000000e+01, 0.000000e+00]> : tensor<2xf32>}> : () -> tensor<2xf32> + // CHECK-NEXT: %[[CON3:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK-NEXT: %[[RES1:.*]] = "tf.Div"(%[[ARG0]], %[[CON1]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> // CHECK-NEXT: %[[RES2:.*]] = "tf.MulNoNan"(%[[ARG0]], %[[CON2]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> // CHECK-NEXT: return %[[RES1]], %[[RES2]], %[[CON3]] : tensor<2xf32>, tensor<2xf32>, tensor<2xf32> @@ -2060,9 +2060,9 @@ func.func @testDivNoNanAndMulNoNanWithConstantY(%arg0: tensor<2xf32>) -> (tensor // CHECK-LABEL: testComplexDivNoNanAndMulNoNanWithConstantY // CHECK-SAME: (%[[ARG0:.*]]: tensor<2xcomplex>) func.func @testComplexDivNoNanAndMulNoNanWithConstantY(%arg0: tensor<2xcomplex>) -> (tensor<2xcomplex>, tensor<2xcomplex>, tensor<2xcomplex>) { - // CHECK-NEXT: %[[COMP2:.*]] = "tf.Const"() {value = dense<[(0.000000e+00,0.000000e+00), (2.000000e+00,0.000000e+00)]> : tensor<2xcomplex>} : () -> tensor<2xcomplex> - // CHECK-NEXT: %[[COMP1:.*]] = "tf.Const"() {value = dense<[(1.000000e+00,3.000000e+00), (2.000000e+00,4.000000e+00)]> : tensor<2xcomplex>} : () -> tensor<2xcomplex> - // CHECK-NEXT: %[[COMP3:.*]] = "tf.Const"() {value = dense<(0.000000e+00,0.000000e+00)> : tensor<2xcomplex>} : () -> tensor<2xcomplex> + // CHECK-NEXT: %[[COMP2:.*]] = "tf.Const"() <{value = dense<[(0.000000e+00,0.000000e+00), (2.000000e+00,0.000000e+00)]> : tensor<2xcomplex>}> : () -> tensor<2xcomplex> + // CHECK-NEXT: %[[COMP1:.*]] = "tf.Const"() <{value = dense<[(1.000000e+00,3.000000e+00), (2.000000e+00,4.000000e+00)]> : tensor<2xcomplex>}> : () -> tensor<2xcomplex> + // CHECK-NEXT: %[[COMP3:.*]] = "tf.Const"() <{value = dense<(0.000000e+00,0.000000e+00)> : tensor<2xcomplex>}> : () -> tensor<2xcomplex> // CHECK-NEXT: %[[RES1:.*]] = "tf.Mul"(%[[ARG0]], %[[COMP1]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2xcomplex>, tensor<2xcomplex>) -> tensor<2xcomplex> // CHECK-NEXT: %[[RES2:.*]] = "tf.DivNoNan"(%[[ARG0]], %[[COMP2]]) : (tensor<2xcomplex>, tensor<2xcomplex>) -> tensor<2xcomplex> // CHECK-NEXT: return %[[RES1]], %[[RES2]], %[[COMP3]] : tensor<2xcomplex>, tensor<2xcomplex>, tensor<2xcomplex> @@ -2104,7 +2104,7 @@ func.func @testDivNoNanAndMulNoNanWithNonConstantY(%arg0: tensor<2xf32>, %arg1: // CHECK-LABEL: testComplexDivNoNanOpWithNonConstantY // CHECK-SAME: (%[[ARG0:.*]]: tensor<2xcomplex>, %[[ARG1:.*]]: tensor<2xcomplex>, %[[ARG2:.*]]: tensor<2xf32>) func.func @testComplexDivNoNanOpWithNonConstantY(%arg0: tensor<2xcomplex>, %arg1: tensor<2xcomplex>, %arg2: tensor<2xf32>) -> (tensor<2xcomplex>, tensor<2xcomplex>, tensor<2xcomplex>) { - // CHECK: %[[CON1:.*]] = "tf.Const"() {value = dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32> + // CHECK: %[[CON1:.*]] = "tf.Const"() <{value = dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK-NEXT: %[[NONCON2:.*]] = "tf.Sub"(%[[ARG0]], %[[ARG1]]) : (tensor<2xcomplex>, tensor<2xcomplex>) -> tensor<2xcomplex> // CHECK-NEXT: %[[NONCON3:.*]] = "tf.Complex"(%[[CON1]], %[[ARG2]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xcomplex> // CHECK-NEXT: %[[RES1:.*]] = "tf.MulNoNan"(%[[ARG0]], %[[ARG1]]) : (tensor<2xcomplex>, tensor<2xcomplex>) -> tensor<2xcomplex @@ -2128,7 +2128,7 @@ func.func @testXlaConvToV2(%lhs: tensor<8x4x16x16x16xf32>, %rhs: tensor<4x3x3x16 
%rhs_dilation = "tf.Const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> %padding = "tf.Const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> %strides = "tf.Const"() {value = dense<[3, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK: "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_0, %cst_1, %cst) {batch_group_count = 1 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0", dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> + // CHECK: "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_0, %cst_1, %cst) <{batch_group_count = 1 : i64, dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> %0 = "tf.XlaConv"(%lhs, %rhs, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = "", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> func.return %0 : tensor<8x4x14x14x16xf32> } @@ -2136,7 +2136,7 @@ func.func @testXlaConvToV2(%lhs: tensor<8x4x16x16x16xf32>, %rhs: tensor<4x3x3x16 // CHECK-LABEL: testXlaReduceToXlaVariadicReduceV2 func.func @testXlaReduceToXlaVariadicReduceV2(%arg0: tensor<*xbf16>, %arg1: tensor<*xbf16>) -> tensor<*xbf16> { - // CHECK: "tf.XlaVariadicReduceV2"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", dimensions_to_reduce = [], operandSegmentSizes = array, reducer = @sum1} : (tensor<*xbf16>, tensor<*xbf16>) -> tensor<*xbf16> + // CHECK: "tf.XlaVariadicReduceV2"(%arg0, %arg1) <{dimensions_to_reduce = [], operandSegmentSizes = array, reducer = @sum1}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xbf16>, tensor<*xbf16>) -> tensor<*xbf16> %0 = "tf.XlaReduce"(%arg0, %arg1) {dimensions_to_reduce = [], reducer = @sum1, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<*xbf16>, tensor<*xbf16>) -> tensor<*xbf16> func.return %0 : tensor<*xbf16> } @@ -2148,7 +2148,7 @@ func.func private @sum1(%arg0: tensor<*xbf16>, %arg1: tensor<*xbf16>) -> tensor< // CHECK-LABEL: testXlaVariadicReduceToV2 func.func @testXlaVariadicReduceToV2(%arg0: tensor<3x4xf32>, %arg1: tensor) -> tensor { - // CHECK: "tf.XlaVariadicReduceV2"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", dimensions_to_reduce = [], operandSegmentSizes = array, reducer = @sum2} : (tensor<3x4xf32>, tensor) -> tensor + // CHECK: "tf.XlaVariadicReduceV2"(%arg0, %arg1) <{dimensions_to_reduce = [], operandSegmentSizes = array, reducer = @sum2}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<3x4xf32>, tensor) -> tensor %0 = "tf.XlaVariadicReduce"(%arg0, %arg1) {dimensions_to_reduce = [], reducer = @sum2, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<3x4xf32>, tensor) -> tensor func.return %0 : tensor } @@ -2229,8 +2229,8 @@ func.func @testTensorListGetItem(%arg0: tensor<1600x1x32xf32>, %arg1: tensor<2xi %1 = "tf.TensorListGetItem"(%0, %arg2, 
%arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor>>, tensor, tensor<2xi32>) -> tensor<1x32xf32> func.return %1 : tensor<1x32xf32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK: %[[RES:.*]] = "tf.GatherV2"(%arg0, %arg2, %cst) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1600x1x32xf32>, tensor, tensor) -> tensor<1x32xf32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK: %[[RES:.*]] = "tf.GatherV2"(%arg0, %arg2, %cst) <{batch_dims = 0 : i64}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1600x1x32xf32>, tensor, tensor) -> tensor<1x32xf32> } // CHECK-LABEL: testTensorListGetItemMultipleUsers @@ -2240,9 +2240,9 @@ func.func @testTensorListGetItemMultipleUsers(%arg0: tensor<1600x1x32xf32>, %arg %2 = "tf.TensorListGetItem"(%0, %arg3, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor>>, tensor, tensor<2xi32>) -> tensor<1x32xf32> func.return %1, %2 : tensor<1x32xf32>, tensor<1x32xf32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK: %[[RES0:.*]] = "tf.GatherV2"(%arg0, %arg2, %cst) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1600x1x32xf32>, tensor, tensor) -> tensor<1x32xf32> - // CHECK: %[[RES1:.*]] = "tf.GatherV2"(%arg0, %arg3, %cst) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1600x1x32xf32>, tensor, tensor) -> tensor<1x32xf32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK: %[[RES0:.*]] = "tf.GatherV2"(%arg0, %arg2, %cst) <{batch_dims = 0 : i64}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1600x1x32xf32>, tensor, tensor) -> tensor<1x32xf32> + // CHECK: %[[RES1:.*]] = "tf.GatherV2"(%arg0, %arg3, %cst) <{batch_dims = 0 : i64}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<1600x1x32xf32>, tensor, tensor) -> tensor<1x32xf32> } // CHECK-LABEL: testUnaryIdempotent diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize_compile_and_replicate_attributes.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize_compile_and_replicate_attributes.mlir index 9f6dc7b017fb90..224395853c0901 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize_compile_and_replicate_attributes.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize_compile_and_replicate_attributes.mlir @@ -3,7 +3,7 @@ // CHECK-LABEL: func.func @convert_tpu_replicate func.func @convert_tpu_replicate() { tf_executor.graph { - // CHECK: tf_executor.island wraps "tf.TPUReplicateMetadata"() {_replication_info = "cluster", _xla_compile_device_type = "TPU", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : () -> () + // CHECK: tf_executor.island wraps "tf.TPUReplicateMetadata"() <{allow_soft_placement = false, computation_shape = [], device_assignment = [], host_compute_core = [], num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_spmd_for_xla_partitioning = false, use_tpu = true}> {_replication_info = "cluster", _xla_compile_device_type = "TPU", 
device = "", name = "TPUReplicateMetadata"} : () -> () %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_tpu = true, use_spmd_for_xla_partitioning = false} : () -> () %outputs_0, %control_0 = tf_executor.island wraps "tf.Placeholder"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "y", shape = "tfshape$dim { }"} : () -> tensor<0xf32> %outputs_1, %control_1 = tf_executor.island wraps "tf.TPUReplicatedInput"(%outputs_0) {N = 1 : i64, T = "tfdtype$DT_FLOAT", device = "", name = "input1"} : (tensor<0xf32>) -> tensor<0xf32> @@ -21,7 +21,7 @@ func.func @convert_tpu_replicate() { // CHECK-LABEL: func.func @convert_xla_must_compile func.func @convert_xla_must_compile(%arg0: tensor) -> tensor { - // CHECK: "tf.StatefulPartitionedCall"(%arg0) {_xla_compile_device_type = "CPU", config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @stateful_pcall_func} : (tensor) -> tensor + // CHECK: "tf.StatefulPartitionedCall"(%arg0) <{config = "", config_proto = "", executor_type = "", f = @stateful_pcall_func}> {_xla_compile_device_type = "CPU", device = "/device:CPU:0"} : (tensor) -> tensor %0 = "tf.StatefulPartitionedCall"(%arg0) {_XlaMustCompile = true, config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @stateful_pcall_func} : (tensor) -> (tensor) func.return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/cluster_tf_ops_pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/cluster_tf_ops_pass.mlir index 8d94a0778b839d..85f9c47058ecf5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/cluster_tf_ops_pass.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/cluster_tf_ops_pass.mlir @@ -43,7 +43,7 @@ func.func @while_cond(%arg0: tensor {tf.device = "/job:localhost/replica:0/ // CHECK: func @while_body(%[[ARG_0:.*]]: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) // CHECK-NEXT: %[[RESULT_0:.*]] = "tf.Const"() // CHECK-NEXT: %[[RESULT_1:.*]] = "tf.AddV2"(%[[ARG_0]], %[[RESULT_0]]) -// CHECK-NEXT: %[[RESULT_2:.*]] = "tf.Const"() {value = dense<16> : tensor} : () -> tensor +// CHECK-NEXT: %[[RESULT_2:.*]] = "tf.Const"() <{value = dense<16> : tensor}> : () -> tensor // CHECK-NEXT: tf_device.send %[[RESULT_2]] "key-0" "/job:worker/replica:0/task:1/device:CPU:0" // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:CPU:0" // CHECK-NEXT: tf_device.remote_run "/job:worker/replica:0/task:1" @[[BODY_PARTITION_0:.*]]() : () -> () diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index 3231f05133c48f..be0ab858484acd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -9,7 +9,7 @@ func.func @testShape(tensor, tensor<1x32x32x16xf32>, tensor<*xf32>) -> (ten // Result shape need not be static. Folding harness uses TensorFlow constant // in that case. 
- // CHECK-DAG: "tf.Const"() {value = dense<[1, 32, 32, 16]> : tensor<4xi32>} : () -> tensor + // CHECK-DAG: "tf.Const"() <{value = dense<[1, 32, 32, 16]> : tensor<4xi32>}> : () -> tensor %1 = "tf.Shape"(%arg1) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<1x32x32x16xf32>) -> tensor // CHECK: "tf.Shape"(%arg2) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> tensor @@ -28,7 +28,7 @@ func.func @testPow(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> (tensor<4xf32> // CHECK-DAG: %[[RES_NO_FOLD:.*]] = "tf.Pow"(%arg0, %arg1) %0 = "tf.Pow"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - // CHECK-DAG: %[[POW_ZERO:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> + // CHECK-DAG: %[[POW_ZERO:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<4xf32>}> : () -> tensor<4xf32> %1 = "tf.Pow"(%arg0, %cst_zero) : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK-NOT: "tf.Pow" @@ -42,7 +42,7 @@ func.func @testPow(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> (tensor<4xf32> func.func @testEmpty32() -> (tensor<5xi32>) { %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor - // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0> : tensor<5xi32>} + // CHECK: [[VAL:%.+]] = "tf.Const"() <{value = dense<0> : tensor<5xi32>}> // CHECK: return [[VAL]] %1 = "tf.Empty"(%0) : (tensor) -> (tensor<5xi32>) func.return %1 : tensor<5xi32> @@ -52,7 +52,7 @@ func.func @testEmpty32() -> (tensor<5xi32>) { func.func @testEmpty64() -> (tensor<5xi64>) { %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor - // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0> : tensor<5xi64>} + // CHECK: [[VAL:%.+]] = "tf.Const"() <{value = dense<0> : tensor<5xi64>}> // CHECK: return [[VAL]] : tensor<5xi64> %1 = "tf.Empty"(%0) : (tensor) -> (tensor<5xi64>) func.return %1 : tensor<5xi64> @@ -62,7 +62,7 @@ func.func @testEmpty64() -> (tensor<5xi64>) { func.func @testEmptyFloat() -> (tensor<5xf64>) { %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor - // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<5xf64>} + // CHECK: [[VAL:%.+]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<5xf64>}> // CHECK: return [[VAL]] %1 = "tf.Empty"(%0) : (tensor) -> (tensor<5xf64>) func.return %1 : tensor<5xf64> @@ -72,7 +72,7 @@ func.func @testEmptyFloat() -> (tensor<5xf64>) { func.func @testEmptyf16() -> (tensor<5xf16>) { %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor - // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<5xf16>} + // CHECK: [[VAL:%.+]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<5xf16>}> // CHECK: return [[VAL]] %1 = "tf.Empty"(%0) : (tensor) -> (tensor<5xf16>) func.return %1 : tensor<5xf16> @@ -82,7 +82,7 @@ func.func @testEmptyf16() -> (tensor<5xf16>) { func.func @testEmptybf16() -> (tensor<5xbf16>) { %0 = "tf.Const"() { value = dense<5> : tensor } : () -> tensor - // CHECK: [[VAL:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<5xbf16>} + // CHECK: [[VAL:%.+]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<5xbf16>}> // CHECK: return [[VAL]] %1 = "tf.Empty"(%0) : (tensor) -> (tensor<5xbf16>) func.return %1 : tensor<5xbf16> @@ -91,8 +91,8 @@ func.func @testEmptybf16() -> (tensor<5xbf16>) { // CHECK-LABEL: func @testShapeN func.func @testShapeN(%arg0: tensor, %arg1: tensor<1x32x32x16xf32>) -> (tensor<0xi64>, tensor<4xi64>) { - // CHECK-DAG: %[[SHAPE0:.*]] = "tf.Const"() {value = dense<> : tensor<0xi64>} - // 
CHECK-DAG: %[[SHAPE1:.*]] = "tf.Const"() {value = dense<[1, 32, 32, 16]> : tensor<4xi64>} + // CHECK-DAG: %[[SHAPE0:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi64>}> + // CHECK-DAG: %[[SHAPE1:.*]] = "tf.Const"() <{value = dense<[1, 32, 32, 16]> : tensor<4xi64>}> %0:2 = "tf.ShapeN"(%arg0, %arg1) : (tensor, tensor<1x32x32x16xf32>) -> (tensor<0xi64>, tensor<4xi64>) // CHECK: return %[[SHAPE0]], %[[SHAPE1]] @@ -101,8 +101,8 @@ func.func @testShapeN(%arg0: tensor, %arg1: tensor<1x32x32x16xf32>) -> (ten // CHECK-LABEL: func @testShapeNPartialStatic func.func @testShapeNPartialStatic(%arg0: tensor, %arg1: tensor<2x?x3xf32>, %arg2: tensor<1x32x32x16xf32>, %arg3: tensor<*xf32>) -> (tensor<0xi64>, tensor<3xi64>, tensor<4xi64>, tensor) { - // CHECK-DAG: %[[SHAPE0:.*]] = "tf.Const"() {value = dense<> : tensor<0xi64>} - // CHECK-DAG: %[[SHAPE2:.*]] = "tf.Const"() {value = dense<[1, 32, 32, 16]> : tensor<4xi64>} + // CHECK-DAG: %[[SHAPE0:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi64>}> + // CHECK-DAG: %[[SHAPE2:.*]] = "tf.Const"() <{value = dense<[1, 32, 32, 16]> : tensor<4xi64>}> // CHECK: %[[SHAPE13:.*]]:2 = "tf.ShapeN"(%arg1, %arg3) : (tensor<2x?x3xf32>, tensor<*xf32>) -> (tensor<3xi64>, tensor) %0:4 = "tf.ShapeN"(%arg0, %arg1, %arg2, %arg3) : (tensor, tensor<2x?x3xf32>, tensor<1x32x32x16xf32>, tensor<*xf32>) -> (tensor<0xi64>, tensor<3xi64>, tensor<4xi64>, tensor) @@ -112,8 +112,8 @@ func.func @testShapeNPartialStatic(%arg0: tensor, %arg1: tensor<2x?x3xf32>, // CHECK-LABEL: func @testShapeNOneDynamic func.func @testShapeNOneDynamic(%arg0: tensor, %arg1: tensor<1x32x32x16xf32>, %arg2: tensor<*xf32>) -> (tensor<0xi64>, tensor<4xi64>, tensor) { - // CHECK-DAG: %[[SHAPE0:.*]] = "tf.Const"() {value = dense<> : tensor<0xi64>} - // CHECK-DAG: %[[SHAPE1:.*]] = "tf.Const"() {value = dense<[1, 32, 32, 16]> : tensor<4xi64>} + // CHECK-DAG: %[[SHAPE0:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi64>}> + // CHECK-DAG: %[[SHAPE1:.*]] = "tf.Const"() <{value = dense<[1, 32, 32, 16]> : tensor<4xi64>}> // CHECK: %[[SHAPE2:.*]] = "tf.Shape"(%arg2) : (tensor<*xf32>) -> tensor %0:3 = "tf.ShapeN"(%arg0, %arg1, %arg2) : (tensor, tensor<1x32x32x16xf32>, tensor<*xf32>) -> (tensor<0xi64>, tensor<4xi64>, tensor) @@ -140,8 +140,8 @@ func.func @testLeakyRelu(%arg0 : tensor<16xf32>) -> (tensor<16xf32>, tensor %2 = "tf.LeakyRelu"(%arg0) {alpha = 3.0 : f32} : (tensor<16xf32>) -> tensor<16xf32> // CHECK-DAG: [[POS:%.*]] = "tf.Const{{.*}} dense<5.000000e+00> : tensor // CHECK-DAG: [[NEG:%.*]] = "tf.Const{{.*}} dense<-1.000000e+00> : tensor - // CHECK: [[NC1:%.*]] = "tf.LeakyRelu"(%arg0) {alpha = 2.000000e-01 : f32} : (tensor<16xf32>) -> tensor<16xf32> - // CHECK: [[NC2:%.*]] = "tf.LeakyRelu"(%arg0) {alpha = 3.000000e+00 : f32} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK: [[NC1:%.*]] = "tf.LeakyRelu"(%arg0) <{alpha = 2.000000e-01 : f32}> : (tensor<16xf32>) -> tensor<16xf32> + // CHECK: [[NC2:%.*]] = "tf.LeakyRelu"(%arg0) <{alpha = 3.000000e+00 : f32}> : (tensor<16xf32>) -> tensor<16xf32> // CHECK: return [[NC1]], [[POS]], [[NEG]], [[NC2]] func.return %no, %0, %1, %2 : tensor<16xf32>, tensor, tensor, tensor<16xf32> } @@ -295,8 +295,8 @@ func.func @testUnimplementedOp() -> (tensor, tensor) { %3 = "tf.Minimum"(%0, %1) {random_attr = "hello"} : (tensor, tensor) -> tensor func.return %2, %3: tensor, tensor -// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<2> : tensor} : () -> tensor -// CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() 
<{value = dense<2> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: return %[[CST]], %[[CST1]] } @@ -454,7 +454,7 @@ func.func @DontRemoveTrivialMul(%arg0: tensor<1x6x8x1xf32>) -> tensor<1x6x8x1xf3 %1 = "tf.Mul"(%arg0, %0) : (tensor<1x6x8x1xf32>, tensor) -> tensor<1x6x8x1xf32> func.return %1 : tensor<1x6x8x1xf32> // CHECK-LABEL: DontRemoveTrivialMul - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor}> : () -> tensor // CHECK: %[[RESULT:.*]] = "tf.Mul"(%arg0, %[[CONST]]) : (tensor<1x6x8x1xf32>, tensor) -> tensor<1x6x8x1xf32> // CHECK: return %[[RESULT]] : tensor<1x6x8x1xf32> } @@ -517,7 +517,7 @@ func.func @testBroadcastGradientArgsSameShape() -> (tensor<0xi32>, tensor<0xi32> %s1 = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> %r0, %r1 = "tf.BroadcastGradientArgs"(%s0, %s1) {} : (tensor<2xi32>, tensor<2xi32>) -> (tensor<0xi32>, tensor<0xi32>) - // CHECK-DAG: %[[R:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-DAG: %[[R:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R]], %[[R]] @@ -529,8 +529,8 @@ func.func @testBroadcastGradientArgs1() -> (tensor<1xi32>, tensor<0xi32>) { %s0 = "tf.Const"() {value = dense<[4]> : tensor<1xi32>} : () -> tensor<1xi32> %s1 = "tf.Const"() {value = dense<[2, 4]> : tensor<2xi32>} : () -> tensor<2xi32> %r0, %r1 = "tf.BroadcastGradientArgs"(%s0, %s1) {} : (tensor<1xi32>, tensor<2xi32>) -> (tensor<1xi32>, tensor<0xi32>) - // CHECK-DAG: %[[R0:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-DAG: %[[R1:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-DAG: %[[R0:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK-DAG: %[[R1:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R0]], %[[R1]] @@ -542,8 +542,8 @@ func.func @testBroadcastGradientArgs2() -> (tensor<1xi32>, tensor<3xi32>) { %s2 = "tf.Const"() {value = dense<[501, 1, 32, 1280]> : tensor<4xi32>} : () -> tensor<4xi32> %s3 = "tf.Const"() {value = dense<[ 1, 1, 1, 1280]> : tensor<4xi32>} : () -> tensor<4xi32> %r2, %r3 = "tf.BroadcastGradientArgs"(%s2, %s3) {} : (tensor<4xi32>, tensor<4xi32>) -> (tensor<1xi32>, tensor<3xi32>) - // CHECK-DAG: %[[R2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-DAG: %[[R3:.*]] = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> tensor<3xi32> + // CHECK-DAG: %[[R2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK-DAG: %[[R3:.*]] = "tf.Const"() <{value = dense<[0, 1, 2]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R2]], %[[R3]] @@ -555,7 +555,7 @@ func.func @testBroadcastGradientArgs3() -> (tensor<3xi32>, tensor<3xi32>) { %s4 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> %s5 = "tf.Const"() {value = dense<[1, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %r4, %r5 = "tf.BroadcastGradientArgs"(%s4, %s5) {} : (tensor<0xi32>, tensor<3xi32>) -> (tensor<3xi32>, tensor<3xi32>) - // CHECK: %[[R0:.*]] = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> 
tensor<3xi32> + // CHECK: %[[R0:.*]] = "tf.Const"() <{value = dense<[0, 1, 2]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R0]], %[[R0]] @@ -567,8 +567,8 @@ func.func @testBroadcastGradientArgs4() -> (tensor<2xi32>, tensor<3xi32>) { %s4 = "tf.Const"() {value = dense<[1, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %s5 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> %r4, %r5 = "tf.BroadcastGradientArgs"(%s4, %s5) {} : (tensor<3xi32>, tensor<0xi32>) -> (tensor<2xi32>, tensor<3xi32>) - // CHECK-DAG: %[[R0:.*]] = "tf.Const"() {value = dense<[0, 2]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK-DAG: %[[R1:.*]] = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> tensor<3xi32> + // CHECK-DAG: %[[R0:.*]] = "tf.Const"() <{value = dense<[0, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> + // CHECK-DAG: %[[R1:.*]] = "tf.Const"() <{value = dense<[0, 1, 2]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R0]], %[[R1]] @@ -580,7 +580,7 @@ func.func @testBroadcastGradientArgs5() -> (tensor<1xi32>, tensor<1xi32>) { %s4 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> %s5 = "tf.Const"() {value = dense<[1]> : tensor<1xi32>} : () -> tensor<1xi32> %r4, %r5 = "tf.BroadcastGradientArgs"(%s4, %s5) {} : (tensor<0xi32>, tensor<1xi32>) -> (tensor<1xi32>, tensor<1xi32>) - // CHECK: %[[R0:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[R0:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R0]], %[[R0]] @@ -592,8 +592,8 @@ func.func @testBroadcastGradientArgs6() -> (tensor<1xi32>, tensor<0xi32>) { %s4 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> %s5 = "tf.Const"() {value = dense<[2]> : tensor<1xi32>} : () -> tensor<1xi32> %r4, %r5 = "tf.BroadcastGradientArgs"(%s4, %s5) {} : (tensor<0xi32>, tensor<1xi32>) -> (tensor<1xi32>, tensor<0xi32>) - // CHECK-DAG: %[[R0:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-DAG: %[[R1:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-DAG: %[[R0:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK-DAG: %[[R1:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R0]], %[[R1]] @@ -605,8 +605,8 @@ func.func @testBroadcastGradientArgsHigherRank() -> (tensor<2xi32>, tensor<2xi32 %s0 = "tf.Const"() {value = dense<[1, 4, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %s1 = "tf.Const"() {value = dense<[1, 4]> : tensor<2xi32>} : () -> tensor<2xi32> %r0, %r1 = "tf.BroadcastGradientArgs"(%s0, %s1) {} : (tensor<3xi32>, tensor<2xi32>) -> (tensor<2xi32>, tensor<2xi32>) - // CHECK-DAG: %[[R0:.*]] = "tf.Const"() {value = dense<[0, 2]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK-DAG: %[[R1:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK-DAG: %[[R0:.*]] = "tf.Const"() <{value = dense<[0, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> + // CHECK-DAG: %[[R1:.*]] = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R0]], %[[R1]] @@ -618,8 +618,8 @@ func.func @testBroadcastGradientArgsScalar() -> (tensor<2xi32>, tensor<0xi32>) { %s0 = "tf.Const"() {value = dense<> 
: tensor<0xi32>} : () -> tensor<0xi32> %s1 = "tf.Const"() {value = dense<[2, 4]> : tensor<2xi32>} : () -> tensor<2xi32> %r0, %r1 = "tf.BroadcastGradientArgs"(%s0, %s1) {} : (tensor<0xi32>, tensor<2xi32>) -> (tensor<2xi32>, tensor<0xi32>) - // CHECK-DAG: %[[R0:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK-DAG: %[[R1:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-DAG: %[[R0:.*]] = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> : () -> tensor<2xi32> + // CHECK-DAG: %[[R1:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R0]], %[[R1]] @@ -631,8 +631,8 @@ func.func @testBroadcastGradientArgI64() -> (tensor<2xi64>, tensor<0xi64>) { %s0 = "tf.Const"() {value = dense<> : tensor<0xi64>} : () -> tensor<0xi64> %s1 = "tf.Const"() {value = dense<[2, 4]> : tensor<2xi64>} : () -> tensor<2xi64> %r0, %r1 = "tf.BroadcastGradientArgs"(%s0, %s1) {} : (tensor<0xi64>, tensor<2xi64>) -> (tensor<2xi64>, tensor<0xi64>) - // CHECK-DAG: %[[R0:.*]] = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> - // CHECK-DAG: %[[R1:.*]] = "tf.Const"() {value = dense<> : tensor<0xi64>} : () -> tensor<0xi64> + // CHECK-DAG: %[[R0:.*]] = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> + // CHECK-DAG: %[[R1:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi64>}> : () -> tensor<0xi64> // CHECK-NOT: tf.BroadcastGradientArgs // CHECK: return %[[R0]], %[[R1]] @@ -643,7 +643,7 @@ func.func @testBroadcastGradientArgI64() -> (tensor<2xi64>, tensor<0xi64>) { func.func @testEmptyResults(%arg0: tensor<0x2xf32>) -> tensor<0x2xf32> { %indices = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> - // CHECK: "tf.Const"() {value = dense<> : tensor<0x2xf32>} : () -> tensor<0x2xf32> + // CHECK: "tf.Const"() <{value = dense<> : tensor<0x2xf32>}> : () -> tensor<0x2xf32> %0 = "tf.DynamicStitch"(%indices, %arg0) : (tensor<0xi32>, tensor<0x2xf32>) -> tensor<0x2xf32> func.return %0 : tensor<0x2xf32> } @@ -668,7 +668,7 @@ func.func @range_int() -> tensor { %cst_1 = arith.constant dense<4> : tensor %cst_2 = arith.constant dense<1> : tensor - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 1, 2, 3]> : tensor<4xi32>}> : () -> tensor // CHECK: return %[[CST]] %0 = "tf.Range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -680,7 +680,7 @@ func.func @range_uint() -> tensor { %cst_1 = arith.constant dense<4> : tensor %cst_2 = arith.constant dense<1> : tensor - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 1, 2, 3]> : tensor<4xui32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 1, 2, 3]> : tensor<4xui32>}> : () -> tensor // CHECK: return %[[CST]] %0 = "tf.Range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -692,7 +692,7 @@ func.func @range_float() -> tensor { %cst_1 = arith.constant dense<4.0> : tensor %cst_2 = arith.constant dense<1.0> : tensor - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<4xf32>}> : () -> tensor // CHECK: return %[[CST]] %0 = "tf.Range"(%cst, %cst_1, 
%cst_2) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -700,7 +700,7 @@ func.func @range_float() -> tensor { // CHECK-LABEL: func @testLogicalAndFoldsWithConstantFalse func.func @testLogicalAndFoldsWithConstantFalse(%arg0: tensor) -> (tensor) { - // CHECK: [[CST:%.+]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: [[CST:%.+]] = "tf.Const"() <{value = dense : tensor}> : () -> tensor %cst = arith.constant dense : tensor %0 = "tf.LogicalAnd"(%cst, %arg0) : (tensor, tensor) -> tensor @@ -711,7 +711,7 @@ func.func @testLogicalAndFoldsWithConstantFalse(%arg0: tensor) -> (tensor) -> (tensor) { - // CHECK: [[CST:%.+]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: [[CST:%.+]] = "tf.Const"() <{value = dense : tensor}> : () -> tensor %cst = arith.constant dense : tensor %0 = "tf.LogicalAnd"(%arg0, %cst) : (tensor, tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant_op_device_assignment.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant_op_device_assignment.mlir index 5d890aa8898274..2e326474a82527 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant_op_device_assignment.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant_op_device_assignment.mlir @@ -2,8 +2,8 @@ // CHECK: func @replace_const_op_test func.func @replace_const_op_test() { - // CHECK-NEXT: %[[RESULT_0:.*]] = "tf.Const"() {device = "/job:worker/replica:0/task:0/device:CPU:1", value = dense<2.000000e+00> : tensor} - // CHECK-NEXT: %[[RESULT_1:.*]] = "tf.Const"() {device = "/job:worker/replica:0/task:0/device:CPU:0", value = dense<2.000000e+00> : tensor} + // CHECK-NEXT: %[[RESULT_0:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor}> {device = "/job:worker/replica:0/task:0/device:CPU:1"} + // CHECK-NEXT: %[[RESULT_1:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor}> {device = "/job:worker/replica:0/task:0/device:CPU:0"} // CHECK-NEXT: %[[RESULT_2:.*]] = "tf.AddV2"(%[[RESULT_1]], %[[RESULT_1]]) {device = "/job:worker/replica:0/task:0/device:CPU:0"} // CHECK-NEXT: %[[RESULT_3:.*]] = "tf.AddV2"(%[[RESULT_0]], %[[RESULT_0]]) {device = "/job:worker/replica:0/task:0/device:CPU:1"} %0 = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor @@ -14,7 +14,7 @@ func.func @replace_const_op_test() { // CHECK: func @no_change_test func.func @no_change_test() -> () { - // CHECK-NEXT: %[[RESULT_0:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK-NEXT: %[[RESULT_0:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %[[RESULT_1:.*]] = "tf.AddV2"(%[[RESULT_0]], %[[RESULT_0]]) : (tensor, tensor) -> tensor %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %1 = "tf.AddV2"(%0, %0) : (tensor, tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/convert-tf-control-flow-to-scf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/convert-tf-control-flow-to-scf.mlir index 77137819da6485..6368981ba13be5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/convert-tf-control-flow-to-scf.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/convert-tf-control-flow-to-scf.mlir @@ -20,11 +20,11 @@ func.func @test_supported_lowering_of_tf_if_region1(%arg0: tensor, %arg1: te // CHECK-NEXT: %[[RES:.*]]:2 = scf.if %[[COND]] -> (tensor<*xf32>, tensor<4xf32>) { // CHECK-NEXT: %[[CALL:.*]] = func.call @test_if_then1(%[[ARG1]]) : (tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[CALL]], %[[CALL]]) : (tensor<4xf32>, tensor<4xf32>) 
-> tensor<4xf32> - // CHECK-NEXT: %[[CAST:.*]] = "tf.Cast"(%[[CALL]]) {Truncate = false} : (tensor<4xf32>) -> tensor<*xf32> + // CHECK-NEXT: %[[CAST:.*]] = "tf.Cast"(%[[CALL]]) <{Truncate = false}> : (tensor<4xf32>) -> tensor<*xf32> // CHECK-NEXT: scf.yield %[[CAST]], %[[ADD]] : tensor<*xf32>, tensor<4xf32> // CHECK-NEXT: } else { // CHECK-NEXT: %[[CALL_0:.*]] = func.call @test_if_else1(%[[ARG1]]) : (tensor<4xf32>) -> tensor<4xf32> - // CHECK-NEXT: %[[CAST_0:.*]] = "tf.Cast"(%[[CALL_0]]) {Truncate = false} : (tensor<4xf32>) -> tensor<*xf32> + // CHECK-NEXT: %[[CAST_0:.*]] = "tf.Cast"(%[[CALL_0]]) <{Truncate = false}> : (tensor<4xf32>) -> tensor<*xf32> // CHECK-NEXT: scf.yield %[[CAST_0]], %[[CALL_0]] : tensor<*xf32>, tensor<4xf32> // CHECK-NEXT: } // CHECK-NEXT: return %[[RES]]#0, %[[RES]]#1 : tensor<*xf32>, tensor<4xf32> @@ -72,7 +72,7 @@ func.func @test_supported_lowering_of_tf_while_region(%arg0: tensor, %arg1: }) {is_stateless = false} : (tensor, tensor<*xf32>) -> (tensor, tensor<*xf32>) func.return %0#0 : tensor - // CHECK-NEXT: %[[CST:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + // CHECK-NEXT: %[[CST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK-NEXT: %[[RES:.*]]:2 = scf.while (%[[ARG3:.*]] = %[[ARG0]], %[[ARG4:.*]] = %[[ARG2]]) : (tensor, tensor<*xf32>) -> (tensor, tensor<*xf32>) { // CHECK-NEXT: %[[IDEN:.*]] = "tf.Identity"(%[[ARG3]]) : (tensor) -> tensor // CHECK-NEXT: %[[ADD:.*]] = "tf.Add"(%[[ARG1]], %[[ARG3]]) : (tensor, tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir index 4968054455f658..d473c1a7d7b67e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir @@ -49,7 +49,7 @@ func.func @simple_independent_chains(%arg0: !tf_res, %arg1: !tf_res, %arg2: tens tf_executor.graph { // CHECK: %[[A_CONTROL:.*]] = tf_executor.island wraps "tf.OpA"() : () -> () %control_A = tf_executor.island wraps "tf.OpA"() : () -> () - // CHECK: %[[CHAIN_CONSTANT:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CHAIN_CONSTANT:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[WHILE_OUT:.*]]:6, %[[WHILE_CONTROL:.*]] = tf_executor.island(%[[A_CONTROL]]) wraps "tf.While"(%[[ARG_0]], %[[ARG_1]], %[[ARG_2]], %[[ARG_2]], %[[CHAIN_CONSTANT]], %[[CHAIN_CONSTANT]]) %while_out:4, %control_while = tf_executor.island(%control_A) wraps "tf.While"(%arg0, %arg1, %arg2, %arg2) {body = @simple_independent_chains_while_body, cond = @simple_independent_chains_while_cond, is_stateless = false} : (tensor>>, tensor>>, tensor, tensor) -> (tensor>>, tensor>>, tensor, tensor) // CHECK: %[[B_CONTROL:.*]] = tf_executor.island(%[[WHILE_CONTROL]]) wraps "tf.OpB"() : () -> () @@ -117,7 +117,7 @@ func.func @intersecting_chains_while_cond(%arg0: !tf_res, %arg1: !tf_res, %arg2: func.func @intersecting_chains(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor) { // CHECK: tf_executor.graph { tf_executor.graph { - // CHECK: %[[CHAIN_CONSTANT:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CHAIN_CONSTANT:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: 
%[[WHILE_OUT:.*]]:5, %[[WHILE_CONTROL:.*]] = tf_executor.island wraps "tf.While"(%[[ARG_0]], %[[ARG_1]], %[[ARG_2]], %[[ARG_2]], %[[CHAIN_CONSTANT]]) %while_out:4, %while_control = tf_executor.island wraps "tf.While"(%arg0, %arg1, %arg2, %arg2) {body = @intersecting_chains_while_body, cond = @intersecting_chains_while_cond, is_stateless = false} : (tensor>>, tensor>>, tensor, tensor) -> (tensor>>, tensor>>, tensor, tensor) // CHECK: tf_executor.fetch @@ -167,12 +167,12 @@ func.func @multiple_callers_while_cond(%arg0: !tf_res, %arg1: tensor) -> (t func.func @multiple_callers(%arg0: !tf_res, %arg1: tensor) { // CHECK: tf_executor.graph { tf_executor.graph { - // CHECK: %[[CHAIN_CONSTANT_0:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CHAIN_CONSTANT_0:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[WHILE_OUT:.*]]:3, %[[WHILE_CONTROL:.*]] = tf_executor.island wraps "tf.While"(%[[ARG_0]], %[[ARG_1]], %[[CHAIN_CONSTANT_0]]) %while_0_out:2, %while_0_control = tf_executor.island wraps "tf.While"(%arg0, %arg1) {body = @multiple_callers_while_body, cond = @multiple_callers_while_cond, is_stateless = false} : (tensor>>, tensor) -> (tensor>>, tensor) // CHECK: %[[CONTROL_A:.*]] = tf_executor.island(%[[WHILE_CONTROL]]) wraps "tf.OpA"() : () -> () %control_A = tf_executor.island(%while_0_control) wraps "tf.OpA"() : () -> () - // CHECK: %[[CHAIN_CONSTANT_1:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CHAIN_CONSTANT_1:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[WHILE_OUT:.*]]:3, %[[WHILE_CONTROL:.*]] = tf_executor.island(%[[CONTROL_A]]) wraps "tf.While"(%[[ARG_0]], %[[ARG_1]], %[[CHAIN_CONSTANT_1]]) %while_1_out:2, %while_1_control = tf_executor.island(%control_A) wraps "tf.While"(%arg0, %arg1) {body = @multiple_callers_while_body, cond = @multiple_callers_while_cond, is_stateless = false} : (tensor>>, tensor) -> (tensor>>, tensor) // CHECK: tf_executor.fetch @@ -223,7 +223,7 @@ func.func @nested_loop_while_body_outer(%arg0: !tf_res, %arg1: tensor) -> ( // CHECK: %[[GRAPH_OUT:.*]]:3 = tf_executor.graph { %graph:2 = tf_executor.graph { // CHECK: %{{.*}}, %[[CONTROL_CHAIN_0_SRC:.*]] = tf_executor.island wraps "tf.Identity"(%[[CHAIN_0]]) : (tensor) -> tensor - // CHECK: %[[CHAIN_CONSTANT:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CHAIN_CONSTANT:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[WHILE_OUT:.*]]:3, %[[WHILE_CONTROL:.*]] = tf_executor.island(%[[CONTROL_CHAIN_0_SRC]]) wraps "tf.While"(%[[RES_0]], %[[ARG_1]], %[[CHAIN_CONSTANT]]) %while_out:2, %while_control = tf_executor.island() wraps "tf.While"(%arg0, %arg1) {body = @nested_loop_while_body_inner, cond = @nested_loop_while_cond_inner, is_stateless = false} : (tensor>>, tensor) -> (tensor>>, tensor) // CHECK: %[[CHAIN_0_SINK:.*]], %{{.*}} = tf_executor.island(%[[WHILE_CONTROL]]) wraps "tf.Identity"(%[[CHAIN_0]]) : (tensor) -> tensor @@ -252,7 +252,7 @@ func.func @nested_loop_while_cond_outer(%arg0: !tf_res, %arg1: tensor) -> ( func.func @nested_while(%arg0: !tf_res, %arg1: tensor) { // CHECK: tf_executor.graph { tf_executor.graph { - // CHECK: %[[CHAIN_CONSTANT:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor + 
// CHECK: %[[CHAIN_CONSTANT:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[WHILE_OUT:.*]]:3, %[[WHILE_CONTROL:.*]] = tf_executor.island wraps "tf.While"(%[[ARG_0]], %[[ARG_1]], %[[CHAIN_CONSTANT]]) %while_out:2, %while_control = tf_executor.island() wraps "tf.While"(%arg0, %arg1) {body = @nested_loop_while_body_outer, cond = @nested_loop_while_cond_outer, is_stateless = false} : (tensor>>, tensor) -> (tensor>>, tensor) // CHECK: tf_executor.fetch @@ -396,7 +396,7 @@ func.func @unique_resource_chain(%arg0: tensor, %arg1: tensor) { // CHECK-LABEL: func @unique_resource_chain // CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor // CHECK: tf_executor.graph -// CHECK: %[[WHILE:.*]]:2, %[[WHILE_CONTROL:.*]] = tf_executor.island wraps "tf.While"(%[[ARG_0]], %[[ARG_1]]) {body = @unique_resource_chain_while_body, cond = @unique_resource_chain_while_cond, is_stateless = false} : (tensor, tensor) -> (tensor, tensor) +// CHECK: %[[WHILE:.*]]:2, %[[WHILE_CONTROL:.*]] = tf_executor.island wraps "tf.While"(%[[ARG_0]], %[[ARG_1]]) <{body = @unique_resource_chain_while_body, cond = @unique_resource_chain_while_cond, is_stateless = false}> : (tensor, tensor) -> (tensor, tensor) // CHECK: tf_executor.fetch // CHECK: } // CHECK: return @@ -417,12 +417,12 @@ func.func @unique_resource_chain_while_body(%arg0: tensor, %arg1: tensor, %[[ARG_1:.*]]: tensor // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { -// CHECK: %[[THOUSAND:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1000> : tensor} : () -> tensor -// CHECK: %[[STACK_HANDLE:.*]], %{{.*}} = tf_executor.island wraps "tf.StackV2"(%[[THOUSAND]]) {elem_type = f32} : (tensor) -> tensor>> +// CHECK: %[[THOUSAND:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1000> : tensor}> : () -> tensor +// CHECK: %[[STACK_HANDLE:.*]], %{{.*}} = tf_executor.island wraps "tf.StackV2"(%[[THOUSAND]]) <{elem_type = f32}> : (tensor) -> tensor>> // CHECK: %{{.*}}, %[[STACK_PUSH_CONTROL:.*]] = tf_executor.island wraps "tf.StackPushV2"(%[[STACK_HANDLE]], %[[ARG_1]]) : (tensor>>, tensor) -> tensor // CHECK: %[[ADD:.*]], %{{.*}} = tf_executor.island wraps "tf.Add"(%[[ARG_1]], %[[ARG_1]]) : (tensor, tensor) -> tensor // CHECK: %{{.*}}, %{{.*}} = tf_executor.island(%[[STACK_PUSH_CONTROL]]) wraps "tf.StackPushV2"(%[[STACK_HANDLE]], %[[ADD]]) : (tensor>>, tensor) -> tensor -// CHECK: %[[ONE:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[ONE:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[COUNTER:.*]], %{{.*}} = tf_executor.island wraps "tf.Add"(%[[ARG_0]], %[[ONE]]) : (tensor, tensor) -> tensor // CHECK: tf_executor.fetch %[[COUNTER]], %[[ARG_1]] : tensor, tensor // CHECK: } @@ -439,7 +439,7 @@ func.func @unique_resource_chain_while_cond(%arg0: tensor, %arg1: tensor, %[[ARG_1:.*]]: tensor // CHECK: %[[GRAPH:.*]] = tf_executor.graph -// CHECK: %[[CONST:.*]], %[[CONST_CONTROL:.*]] = tf_executor.island wraps "tf.Const"() {value = dense<1000> : tensor} : () -> tensor +// CHECK: %[[CONST:.*]], %[[CONST_CONTROL:.*]] = tf_executor.island wraps "tf.Const"() <{value = dense<1000> : tensor}> : () -> tensor // CHECK: %[[LESS:.*]], %[[LESS_CONTROL:.*]] = tf_executor.island wraps "tf.Less"(%[[CONST]], %[[ARG_0]]) : (tensor, tensor) -> tensor // CHECK: tf_executor.fetch %[[LESS]] : tensor // CHECK: } @@ -464,8 +464,8 @@ func.func @mixed_unique_resource_chain(%arg0: 
tensor, %arg1: tensor) { // CHECK-LABEL: func @mixed_unique_resource_chain // CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor // CHECK: tf_executor.graph -// CHECK: %[[CHAIN_TOKEN:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK: %[[WHILE:.*]]:3, %[[WHILE_CONTROL:.*]] = tf_executor.island wraps "tf.While"(%[[ARG_0]], %[[ARG_1]], %[[CHAIN_TOKEN]]) {body = @mixed_unique_resource_chain_while_body, cond = @mixed_unique_resource_chain_while_cond, is_stateless = false} : (tensor, tensor, tensor) -> (tensor, tensor, tensor) +// CHECK: %[[CHAIN_TOKEN:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor +// CHECK: %[[WHILE:.*]]:3, %[[WHILE_CONTROL:.*]] = tf_executor.island wraps "tf.While"(%[[ARG_0]], %[[ARG_1]], %[[CHAIN_TOKEN]]) <{body = @mixed_unique_resource_chain_while_body, cond = @mixed_unique_resource_chain_while_cond, is_stateless = false}> : (tensor, tensor, tensor) -> (tensor, tensor, tensor) // CHECK: tf_executor.fetch // CHECK: } // CHECK: return @@ -489,14 +489,14 @@ func.func @mixed_unique_resource_chain_while_body(%arg0: tensor, %arg1: ten // CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor, %[[CHAIN_TOKEN:.*]]: tensor // CHECK: %[[GRAPH:.*]]:3 = tf_executor.graph // CHECK: %{{.*}}, %[[CHAIN_SRC:.*]] = tf_executor.island wraps "tf.Identity"(%[[CHAIN_TOKEN]]) : (tensor) -> tensor -// CHECK: %[[THOUSAND:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1000> : tensor} : () -> tensor -// CHECK: %[[STACK_HANDLE:.*]], %{{.*}} = tf_executor.island wraps "tf.StackV2"(%[[THOUSAND]]) {elem_type = f32} : (tensor) -> tensor>> +// CHECK: %[[THOUSAND:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1000> : tensor}> : () -> tensor +// CHECK: %[[STACK_HANDLE:.*]], %{{.*}} = tf_executor.island wraps "tf.StackV2"(%[[THOUSAND]]) <{elem_type = f32}> : (tensor) -> tensor>> // CHECK: %{{.*}}, %[[STACK_PUSH_CONTROL:.*]] = tf_executor.island wraps "tf.StackPushV2"(%[[STACK_HANDLE]], %[[ARG_1]]) : (tensor>>, tensor) -> tensor // CHECK: %[[ADD:.*]], %{{.*}} = tf_executor.island wraps "tf.Add"(%[[ARG_1]], %[[ARG_1]]) : (tensor, tensor) -> tensor // CHECK: %{{.*}}, %{{.*}} = tf_executor.island(%[[STACK_PUSH_CONTROL]]) wraps "tf.StackPushV2"(%[[STACK_HANDLE]], %[[ADD]]) : (tensor>>, tensor) -> tensor -// CHECK: %[[ONE:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[ONE:.*]], %{{.*}} = tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[COUNTER:.*]], %{{.*}} = tf_executor.island wraps "tf.Add"(%[[ARG_0]], %[[ONE]]) : (tensor, tensor) -> tensor -// CHECK: %[[VAR_HANDLE:.*]], %{{.*}} = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> tensor>> +// CHECK: %[[VAR_HANDLE:.*]], %{{.*}} = tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v0"}> : () -> tensor>> // CHECK: %[[ASSIGN_CONTROL:.*]] = tf_executor.island(%[[CHAIN_SRC]]) wraps "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[ARG_1]]) : (tensor>>, tensor) -> () // CHECK: %[[CHAIN_SINK:.*]], %{{.*}} = tf_executor.island(%[[ASSIGN_CONTROL]]) wraps "tf.Identity"(%[[CHAIN_TOKEN]]) : (tensor) -> tensor // CHECK: tf_executor.fetch %[[COUNTER]], %[[ARG_1]], %[[CHAIN_SINK]] : tensor, tensor, tensor @@ -514,7 +514,7 @@ func.func @mixed_unique_resource_chain_while_cond(%arg0: tensor, %arg1: ten // CHECK-LABEL: func 
@mixed_unique_resource_chain_while_cond // CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor, %[[CHAIN_TOKEN:.*]]: tensor // CHECK: %[[GRAPH:.*]] = tf_executor.graph -// CHECK: %[[CONST:.*]], %[[CONST_CONTROL:.*]] = tf_executor.island wraps "tf.Const"() {value = dense<1000> : tensor} : () -> tensor +// CHECK: %[[CONST:.*]], %[[CONST_CONTROL:.*]] = tf_executor.island wraps "tf.Const"() <{value = dense<1000> : tensor}> : () -> tensor // CHECK: %[[LESS:.*]], %[[LESS_CONTROL:.*]] = tf_executor.island wraps "tf.Less"(%[[CONST]], %[[ARG_0]]) : (tensor, tensor) -> tensor // CHECK: tf_executor.fetch %[[LESS]] : tensor // CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/convert_launch_func_to_tf_call.mlir b/tensorflow/compiler/mlir/tensorflow/tests/convert_launch_func_to_tf_call.mlir index ea44e0fccfb6ee..4c532cdabc0eaa 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/convert_launch_func_to_tf_call.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/convert_launch_func_to_tf_call.mlir @@ -11,8 +11,8 @@ func.func @single_launch_func(%arg0: tensor) -> tensor { %2 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[CALL_OUTPUT:[0-9]*]] = "tf.PartitionedCall"(%[[A_OUTPUT]]) - // CHECK-SAME: device = "/device:test_device:0" // CHECK-SAME: f = @_func + // CHECK-SAME: device = "/device:test_device:0" %3 = "tf_device.launch_func"(%2) {device = "/device:test_device:0", func = @_func} : (tensor) -> tensor // CHECK: tf_executor.yield %[[CALL_OUTPUT]] @@ -40,13 +40,13 @@ func.func @multi_launch_func(%arg0: tensor) -> tensor { %2 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[CALL_OUTPUT_0:[0-9]*]] = "tf.PartitionedCall"(%[[A_OUTPUT]]) - // CHECK-SAME: device = "/device:test_device:0" // CHECK-SAME: f = @_func + // CHECK-SAME: device = "/device:test_device:0" %3 = "tf_device.launch_func"(%2) {device = "/device:test_device:0", func = @_func} : (tensor) -> tensor // CHECK: %[[CALL_OUTPUT_1:[0-9]*]] = "tf.PartitionedCall"(%[[CALL_OUTPUT_0]]) - // CHECK-SAME: device = "/device:test_device:1" // CHECK-SAME: f = @_func + // CHECK-SAME: device = "/device:test_device:1" %4 = "tf_device.launch_func"(%3) {device = "/device:test_device:1", func = @_func} : (tensor) -> tensor // CHECK: tf_executor.yield %[[CALL_OUTPUT_1]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir index 7af6ff18023782..300a766afb22ec 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir @@ -9,7 +9,7 @@ // CHECK-LABEL: func @decomposition_outside_cluster func.func @decomposition_outside_cluster() { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor}> %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CLUSTER-ONLY: "tf.AssignAddVariableOp" // ALWAYS-DECOMPOSE-NOT: "tf.AssignAddVariableOp" @@ -74,7 +74,7 @@ func.func @decompose_use_subtype() { "tf_device.cluster"() ({ %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor}> // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp" // CHECK-SAME: (tensor<*x!tf_type.resource>>) -> tensor<2x8xi32> // CHECK: 
"tf.AddV2"(%[[RES_READ_VAL]], %[[ONE]]) @@ -98,7 +98,7 @@ func.func @decompose_assign_add_variable_op() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor>> - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor}> // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp" // CHECK: "tf.AddV2"(%[[RES_READ_VAL]], %[[ONE]]) // CHECK: "tf.AssignVariableOp" @@ -121,7 +121,7 @@ func.func @decompose_assign_sub_variable_op() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor>> - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor}> // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp" // CHECK: "tf.Sub"(%[[RES_READ_VAL]], %[[ONE]]) // CHECK: "tf.AssignVariableOp" @@ -323,8 +323,8 @@ func.func @decompose_resource_apply_adagradv2(%arg0: tensor, %arg1: tensor< // CHECK: [[VAR_DELTA:%.*]] = "tf.Div"([[LR_MULTIPLY]], [[DIVISOR]]) : (tensor, tensor<*xf32>) -> tensor<*xf32> // CHECK: [[OLD_VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) : (tensor<*x!tf_type.resource>>) -> tensor<*xf32> // CHECK: [[NEW_VAR:%.*]] = "tf.Sub"(%9, %8) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - // CHECK: "tf.AssignVariableOp"([[VAR_HANDLE]], [[NEW_VAR]]) {validate_shape = false} : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () - // CHECK: "tf.AssignVariableOp"([[ACC_HANDLE]], [[NEW_ACC]]) {validate_shape = false} : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"([[VAR_HANDLE]], [[NEW_VAR]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"([[ACC_HANDLE]], [[NEW_ACC]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> @@ -341,8 +341,8 @@ func.func @decompose_resource_apply_adagradv2(%arg0: tensor, %arg1: tensor< func.func @decompose_resource_apply_adagrad(%arg0: tensor, %arg1: tensor) -> () { "tf_device.cluster"() ({ - // CHECK: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> - // CHECK: %[[ACCUM_HANDLE:.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> + // CHECK: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() <{container = "c", shared_name = "v"}> : () -> tensor<*x!tf_type.resource>> + // CHECK: %[[ACCUM_HANDLE:.*]] = "tf.VarHandleOp"() <{container = "c", shared_name = "v"}> : () -> tensor<*x!tf_type.resource>> // CHECK: %[[ACCUM:.*]] = "tf.ReadVariableOp"(%[[ACCUM_HANDLE]]) : (tensor<*x!tf_type.resource>>) -> tensor<*xf32> // CHECK: %[[GRAD_SQUARE:.*]] = "tf.Mul"(%[[GRAD]], %[[GRAD]]) : (tensor, tensor) -> tensor // CHECK: %[[ACCUM_NEW:.*]] = "tf.AddV2"(%[[ACCUM]], %[[GRAD_SQUARE]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> @@ -351,8 +351,8 @@ func.func @decompose_resource_apply_adagrad(%arg0: tensor, %arg1: tensor, tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[VAR:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE]]) : (tensor<*x!tf_type.resource>>) -> tensor<*xf32> // CHECK: %[[VAR_NEW:.*]] = "tf.Sub"(%[[VAR]], %[[DIV]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - // CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) {validate_shape = false} : 
(tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () - // CHECK: "tf.AssignVariableOp"(%[[ACCUM_HANDLE]], %[[ACCUM_NEW]]) {validate_shape = false} : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"(%[[ACCUM_HANDLE]], %[[ACCUM_NEW]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> @@ -372,7 +372,7 @@ func.func @decompose_resource_apply_adagrad(%arg0: tensor, %arg1: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor) -> () { "tf_device.cluster"() ({ - // CHECK: [[ONE:%.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK: [[ONE:%.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> // CHECK: [[VAR_HANDLE:%.*]] = "tf.VarHandleOp"() // CHECK: [[M_HANDLE:%.*]] = "tf.VarHandleOp"() // CHECK: [[V_HANDLE:%.*]] = "tf.VarHandleOp"() @@ -422,10 +422,10 @@ func.func @decompose_resource_apply_adam_non_nesterov(%arg0: tensor, %arg1: func.func @decompose_resource_apply_adam_nesterov(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor) -> () { "tf_device.cluster"() ({ - // CHECK: [[ONE:%.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} - // CHECK: [[VAR_HANDLE:%.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} - // CHECK: [[M_HANDLE:%.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} - // CHECK: [[V_HANDLE:%.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} + // CHECK: [[ONE:%.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> + // CHECK: [[VAR_HANDLE:%.*]] = "tf.VarHandleOp"() <{container = "c", shared_name = "v"}> + // CHECK: [[M_HANDLE:%.*]] = "tf.VarHandleOp"() <{container = "c", shared_name = "v"}> + // CHECK: [[V_HANDLE:%.*]] = "tf.VarHandleOp"() <{container = "c", shared_name = "v"}> // CHECK: [[VAL_82:%.*]] = "tf.Sub"([[ONE]], [[BETA2_POWER]]) // CHECK: [[VAL_83:%.*]] = "tf.Sqrt"([[VAL_82]]) // CHECK: [[VAL_84:%.*]] = "tf.Sub"([[ONE]], [[BETA1_POWER]]) @@ -452,9 +452,9 @@ func.func @decompose_resource_apply_adam_nesterov(%arg0: tensor, %arg1: ten // CHECK: [[VAL_105:%.*]] = "tf.Div"([[VAL_102]], [[VAL_104]]) // CHECK: [[OLD_VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) : (tensor<*x!tf_type.resource>>) -> tensor<*xf32> // CHECK: [[NEW_VAR:%.*]] = "tf.Sub"([[OLD_VAR]], [[VAL_105]]) - // CHECK: "tf.AssignVariableOp"([[VAR_HANDLE]], [[NEW_VAR]]) {validate_shape = false} : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () - // CHECK: "tf.AssignVariableOp"([[M_HANDLE]], [[NEW_M]]) {validate_shape = false} : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () - // CHECK: "tf.AssignVariableOp"([[V_HANDLE]], [[NEW_V]]) {validate_shape = false} : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"([[VAR_HANDLE]], [[NEW_VAR]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"([[M_HANDLE]], [[NEW_M]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"([[V_HANDLE]], [[NEW_V]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<*xf32>) -> () %0 = 
"tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> @@ -474,7 +474,7 @@ func.func @decompose_resource_apply_adam_nesterov(%arg0: tensor, %arg1: ten func.func @decompose_adam_with_complex_inputs(%arg0: tensor>>>, %arg1: tensor>>>, %arg2: tensor>>>, %arg3: tensor>, %arg4: tensor>, %arg5: tensor>, %arg6: tensor>, %arg7: tensor>, %arg8: tensor>, %arg9: tensor<2xcomplex>) attributes {tf.entry_function = {control_outputs = "Adam/update_Variable_1/ResourceApplyAdam", inputs = "_arg0,_arg1,_arg2,_arg3,_arg4,_arg5,_arg6,_arg7,_arg8,_arg9", outputs = ""}} { "tf_device.cluster"() ({ - // CHECK: "tf.Const"() {value = dense<(1.000000e+00,0.000000e+00)> : tensor>} : () -> tensor> + // CHECK: "tf.Const"() <{value = dense<(1.000000e+00,0.000000e+00)> : tensor>}> : () -> tensor> // CHECK-NOT: tf.ResourceApplyAdam "tf.ResourceApplyAdam"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) {_XlaHasReferenceVars = false, _xla_inferred_shapes = [], device = "/job:localhost/replica:0/task:0/device:TPU:0", use_locking = false, use_nesterov = false} : (tensor>>>, tensor>>>, tensor>>>, tensor>, tensor>, tensor>, tensor>, tensor>, tensor>, tensor<2xcomplex>) -> () @@ -489,13 +489,13 @@ func.func @decompose_adam_with_complex_inputs(%arg0: tensor func.func @decompose_resource_gather_op(%indices : tensor) -> tensor<*xi32> { %0 = "tf_device.cluster"() ({ - // CHECK: [[ZERO:%.+]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: [[ZERO:%.+]] = "tf.Const"() <{value = dense<0> : tensor}> // CHECK: [[VAR:%.+]] = "tf.VarHandleOp" %resource = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> // CHECK: [[READVAR:%.+]] = "tf.ReadVariableOp"([[VAR]]) - // CHECK: [[GATHER:%.+]] = "tf.GatherV2"([[READVAR]], [[INDEX]], [[ZERO]]) {batch_dims = 0 : i64} : (tensor<*xi32>, tensor, tensor) -> tensor<*xi32> + // CHECK: [[GATHER:%.+]] = "tf.GatherV2"([[READVAR]], [[INDEX]], [[ZERO]]) <{batch_dims = 0 : i64}> : (tensor<*xi32>, tensor, tensor) -> tensor<*xi32> // CHECK: return [[GATHER]] %1 = "tf.ResourceGather"(%resource, %indices) : (tensor<*x!tf_type.resource>>, tensor) -> (tensor<*xi32>) tf_device.return %1 : tensor<*xi32> @@ -512,7 +512,7 @@ func.func @decompose_resource_gather_op_subtype(%indices : tensor<5xi32>) -> ten %0 = "tf_device.cluster"() ({ %resource = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> - // CHECK: "tf.GatherV2"({{.+}}, {{.+}}, {{.+}}) {batch_dims = 1 : i64} : (tensor<2x8x16xi32>, tensor<5xi32>, tensor) -> tensor<2x5x16xi32> + // CHECK: "tf.GatherV2"({{.+}}, {{.+}}, {{.+}}) <{batch_dims = 1 : i64}> : (tensor<2x8x16xi32>, tensor<5xi32>, tensor) -> tensor<2x5x16xi32> %1 = "tf.ResourceGather"(%resource, %indices) {batch_dims = 1} : (tensor<*x!tf_type.resource>>, tensor<5xi32>) -> (tensor<2x5x16xi32>) tf_device.return %1 : tensor<2x5x16xi32> @@ -527,7 +527,7 @@ func.func @decompose_resource_gather_op_subtype(%indices : tensor<5xi32>) -> ten // CHECK-SAME: [[VAR:%.*]]: tensor, [[MG:%.*]]: tensor, [[MS:%.*]]: tensor, [[MOM:%.*]]: tensor, [[LR:%.*]]: tensor, [[RHO:%.*]]: tensor, [[MOMENTUM:%.*]]: tensor, [[EPSILON:%.*]]: tensor, [[GRAD:%.*]]: tensor func.func @decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor) -> () { "tf_device.cluster"() ({ - // 
CHECK: [[ONE:%.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK: [[ONE:%.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> // CHECK: [[VAR_HANDLE:%.*]] = "tf.VarHandleOp" // CHECK: [[MG_HANDLE:%.*]] = "tf.VarHandleOp" // CHECK: [[MS_HANDLE:%.*]] = "tf.VarHandleOp" @@ -578,14 +578,14 @@ func.func @decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: // CHECK-SAME: %[[LR:.*]]: tensor, %[[RHO:.*]]: tensor, %[[MOMENTUM:.*]]: tensor, %[[EPSILON:.*]]: tensor, %[[GRAD:.*]]: tensor) func.func @decompose_resource_apply_RMS_prop(%arg0: tensor<*x!tf_type.resource>, %arg1: tensor<*x!tf_type.resource>, %arg2: tensor<*x!tf_type.resource>, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor) -> () { "tf_device.cluster"() ({ - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK: %[[MS:.*]] = "tf.ReadVariableOp"(%[[MS_HANDLE]]) : (tensor<*x!tf_type.resource>) -> tensor<*xf32> // CHECK: %[[MS_RHO:.*]] = "tf.Mul"(%[[MS]], %[[RHO]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> // CHECK: %[[GRAD_SQUARE:.*]] = "tf.Square"(%[[GRAD]]) : (tensor) -> tensor // CHECK: %[[ONE_RHO:.*]] = "tf.Sub"(%[[ONE]], %[[RHO]]) : (tensor, tensor) -> tensor // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[GRAD_SQUARE]], %[[ONE_RHO]]) : (tensor, tensor) -> tensor // CHECK: %[[MS_NEW:.*]] = "tf.AddV2"(%[[MS_RHO]], %[[MUL]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> - // CHECK: "tf.AssignVariableOp"(%[[MS_HANDLE]], %[[MS_NEW]]) {validate_shape = false} : (tensor<*x!tf_type.resource>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"(%[[MS_HANDLE]], %[[MS_NEW]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>, tensor<*xf32>) -> () // CHECK: %[[MOM:.*]] = "tf.ReadVariableOp"(%[[MOM_HANDLE]]) : (tensor<*x!tf_type.resource>) -> tensor<*xf32> // CHECK: %[[MOMENTUM_MOM:.*]] = "tf.Mul"(%[[MOMENTUM]], %[[MOM]]) : (tensor, tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[LR_GRAD:.*]] = "tf.Mul"(%[[LR]], %[[GRAD]]) : (tensor, tensor) -> tensor @@ -593,10 +593,10 @@ func.func @decompose_resource_apply_RMS_prop(%arg0: tensor<*x!tf_type.resource>, // CHECK: %[[SQRT:.*]] = "tf.Sqrt"(%[[ADD]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[DIV:.*]] = "tf.Div"(%[[LR_GRAD]], %[[SQRT]]) : (tensor, tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[MOM_NEW:.*]] = "tf.AddV2"(%[[MOMENTUM_MOM]], %[[DIV]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - // CHECK: "tf.AssignVariableOp"(%[[MOM_HANDLE]], %[[MOM_NEW]]) {validate_shape = false} : (tensor<*x!tf_type.resource>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"(%[[MOM_HANDLE]], %[[MOM_NEW]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>, tensor<*xf32>) -> () // CHECK: %[[VAR:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE]]) : (tensor<*x!tf_type.resource>) -> tensor<*xf32> // CHECK: %[[VAR_NEW:.*]] = "tf.Sub"(%[[VAR]], %[[MOM_NEW]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - // CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) {validate_shape = false} : (tensor<*x!tf_type.resource>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>, tensor<*xf32>) -> () "tf.ResourceApplyRMSProp"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) {use_locking = false} : (tensor<*x!tf_type.resource>, tensor<*x!tf_type.resource>, tensor<*x!tf_type.resource>, tensor, tensor, tensor, tensor, 
tensor) -> () tf_device.return }) : () -> () @@ -609,7 +609,7 @@ func.func @decompose_resource_apply_RMS_prop(%arg0: tensor<*x!tf_type.resource>, // CHECK-LABEL: @decompose_resource_scatter_add_op // CHECK-SAME: ([[INDEX:%.+]]: tensor<2x?xi32>, [[UPDATE:%.+]]: tensor) func.func @decompose_resource_scatter_add_op(%indices : tensor<2x?xi32>, %updates: tensor) { - // CHECK: [[CST:%.+]] = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + // CHECK: [[CST:%.+]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor "tf_device.cluster"() ({ // CHECK: [[VAR:%.+]] = "tf.VarHandleOp" %resource = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> @@ -630,7 +630,7 @@ func.func @decompose_resource_scatter_add_op(%indices : tensor<2x?xi32>, %update // CHECK-LABEL: @decompose_resource_scatter_add_op_1d_indices // CHECK-SAME: ([[INDEX:%.+]]: tensor, [[UPDATE:%.+]]: tensor) func.func @decompose_resource_scatter_add_op_1d_indices(%indices : tensor, %updates: tensor) { - // CHECK: [[CST:%.+]] = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + // CHECK: [[CST:%.+]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor "tf_device.cluster"() ({ // CHECK: [[VAR:%.+]] = "tf.VarHandleOp" %resource = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> @@ -679,7 +679,7 @@ func.func @decompose_ResourceScatterAdd_with_unranked_updates(%resource : tensor // CHECK-LABEL: @decompose_resource_scatter_update_op // CHECK-SAME: ([[INDEX:%.+]]: tensor<2x?xi32>, [[UPDATE:%.+]]: tensor) func.func @decompose_resource_scatter_update_op(%indices : tensor<2x?xi32>, %updates: tensor) { - // CHECK: [[CST:%.+]] = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + // CHECK: [[CST:%.+]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor "tf_device.cluster"() ({ // CHECK: [[VAR:%.+]] = "tf.VarHandleOp" %resource = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> @@ -782,10 +782,10 @@ func.func @decompose_resource_apply_proximal_adagrad_op(%lr: tensor, %l1: t %var = "tf.VarHandleOp"() {container = "c", shared_name = "var"} : () -> tensor<*x!tf_type.resource>> %accum = "tf.VarHandleOp"() {container = "c", shared_name = "accum"} : () -> tensor<*x!tf_type.resource>> - // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "var"} : () -> tensor<*x!tf_type.resource>> - // CHECK-DAG: %[[ACCUM_HANDLE:.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "accum"} : () -> tensor<*x!tf_type.resource>> + // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() <{container = "c", shared_name = "var"}> : () -> tensor<*x!tf_type.resource>> + // CHECK-DAG: %[[ACCUM_HANDLE:.*]] = "tf.VarHandleOp"() <{container = "c", shared_name = "accum"}> : () -> tensor<*x!tf_type.resource>> // CHECK-DAG: %[[GRAD_SQ:.*]] = "tf.Square"(%[[GRAD]]) : (tensor<4xf32>) -> tensor<4xf32> // CHECK-DAG: %[[ACCUM:.*]] = "tf.ReadVariableOp"(%[[ACCUM_HANDLE]]) : (tensor<*x!tf_type.resource>>) -> tensor<4xf32> // CHECK-DAG: %[[ACCUM_NEW:.*]] = "tf.AddV2"(%[[ACCUM]], %[[GRAD_SQ]]) : 
(tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> @@ -805,8 +805,8 @@ func.func @decompose_resource_apply_proximal_adagrad_op(%lr: tensor, %l1: t // CHECK-DAG: %[[SCALED_L2:.*]] = "tf.Mul"(%[[ADAGRAD_LR]], %[[L2]]) : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK-DAG: %[[DENOMINATOR:.*]] = "tf.Add"(%[[ONE]], %[[SCALED_L2]]) : (tensor, tensor<4xf32>) -> tensor<4xf32> // CHECK-DAG: %[[VAR_NEW:.*]] = "tf.Div"(%[[NUMERATOR]], %[[DENOMINATOR]]) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - // CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) {validate_shape = false} : (tensor<*x!tf_type.resource>>, tensor<4xf32>) -> () - // CHECK-DAG: "tf.AssignVariableOp"(%[[ACCUM_HANDLE]], %[[ACCUM_NEW]]) {validate_shape = false} : (tensor<*x!tf_type.resource>>, tensor<4xf32>) -> () + // CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<4xf32>) -> () + // CHECK-DAG: "tf.AssignVariableOp"(%[[ACCUM_HANDLE]], %[[ACCUM_NEW]]) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<4xf32>) -> () "tf.ResourceApplyProximalAdagrad"(%var, %accum, %lr, %l1, %l2, %grad) {use_locking = false} : (tensor<*x!tf_type.resource>>, tensor<*x!tf_type.resource>>, tensor, tensor, tensor, tensor<4xf32>) -> () diff --git a/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir index e1071f3e899c00..6666a08dfde322 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir @@ -6,7 +6,7 @@ func.func @unary_einsum_reduce_sum_transpose(%arg0: tensor<3x4x5x6xf32>) -> tens // CHECK-LABEL: unary_einsum_reduce_sum_transpose // CHECK-DAG: %[[cst:.*]] = arith.constant dense<3> : tensor<1xi32> // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[0, 2, 1]> : tensor<3xi32> - // CHECK: %[[v0:.*]] = "tf.Sum"(%arg0, %[[cst]]) {keep_dims = false} : (tensor<3x4x5x6xf32>, tensor<1xi32>) -> tensor<3x4x5xf32> + // CHECK: %[[v0:.*]] = "tf.Sum"(%arg0, %[[cst]]) <{keep_dims = false}> : (tensor<3x4x5x6xf32>, tensor<1xi32>) -> tensor<3x4x5xf32> // CHECK: %[[v1:.*]] = "tf.Transpose"(%[[v0]], %[[cst_1]]) : (tensor<3x4x5xf32>, tensor<3xi32>) -> tensor<3x5x4xf32> // CHECK: return %[[v1]] : tensor<3x5x4xf32> } @@ -16,7 +16,7 @@ func.func @unary_einsum_reduce_sum_transpose1(%arg0: tensor<3x4x5x6xf32>) -> ten func.return %0 : tensor<3x4x5xf32> // CHECK-LABEL: unary_einsum_reduce_sum_transpose1 // CHECK-DAG: %[[cst:.*]] = arith.constant dense<3> : tensor<1xi32> - // CHECK: %[[v0:.*]] = "tf.Sum"(%arg0, %[[cst]]) {keep_dims = false} : (tensor<3x4x5x6xf32>, tensor<1xi32>) -> tensor<3x4x5xf32> + // CHECK: %[[v0:.*]] = "tf.Sum"(%arg0, %[[cst]]) <{keep_dims = false}> : (tensor<3x4x5x6xf32>, tensor<1xi32>) -> tensor<3x4x5xf32> // CHECK: return %[[v0]] : tensor<3x4x5xf32> } @@ -34,7 +34,7 @@ func.func @unary_einsum_reduce_sum(%arg0: tensor<4x5x6xf32>) -> tensor<4xf32> { func.return %0 : tensor<4xf32> // CHECK-LABEL: unary_einsum_reduce_sum // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[1, 2]> : tensor<2xi32> - // CHECK: %[[v0:.*]] = "tf.Sum"(%arg0, %[[cst]]) {keep_dims = false} : (tensor<4x5x6xf32>, tensor<2xi32>) -> tensor<4xf32> + // CHECK: %[[v0:.*]] = "tf.Sum"(%arg0, %[[cst]]) <{keep_dims = false}> : (tensor<4x5x6xf32>, tensor<2xi32>) -> tensor<4xf32> // CHECK: return %[[v0]] } @@ -42,14 +42,14 @@ func.func @einsum_basic(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> t %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", 
equation = "ijk,ikm->ijm"}: (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: einsum_basic - // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> + // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> } func.func @einsum_matmul(%arg0: tensor<7x9xf32>, %arg1: tensor<9x5xf32>) -> tensor<7x5xf32> { %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ae,ed->ad"}: (tensor<7x9xf32>, tensor<9x5xf32>) -> tensor<7x5xf32> func.return %0 : tensor<7x5xf32> // CHECK-LABEL: einsum_matmul - // CHECK: %[[v0:.*]] = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<7x9xf32>, tensor<9x5xf32>) -> tensor<7x5xf32> + // CHECK: %[[v0:.*]] = "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<7x9xf32>, tensor<9x5xf32>) -> tensor<7x5xf32> // CHECK: return %[[v0]] : tensor<7x5xf32> } @@ -59,7 +59,7 @@ func.func @einsum_matmul_dynamic_size(%arg0: tensor<2x?x?x?xf32>, %arg1: tensor< // CHECK-LABEL: einsum_matmul_dynamic_size // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[2, -1, 1, 1]> : tensor<4xi64> // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg1, %cst) : (tensor<2x?xf32>, tensor<4xi64>) -> tensor<2x?x1x1xf32> - // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %0) {adj_x = false, adj_y = false} : (tensor<2x?x?x?xf32>, tensor<2x?x1x1xf32>) -> tensor<2x?x?x1xf32> + // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %0) <{adj_x = false, adj_y = false}> : (tensor<2x?x?x?xf32>, tensor<2x?x1x1xf32>) -> tensor<2x?x?x1xf32> // CHECK: return %[[v1]] : tensor<2x?x?x1xf32> } @@ -67,14 +67,14 @@ func.func @einsum_broadcast(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5x6xf32>) -> %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,km->ijm"}: (tensor<3x4x5xf32>, tensor<5x6xf32>) -> tensor<3x4x6xf32> func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: einsum_broadcast - // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<3x4x5xf32>, tensor<5x6xf32>) -> tensor<3x4x6xf32> + // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<3x4x5xf32>, tensor<5x6xf32>) -> tensor<3x4x6xf32> } func.func @einsum_broadcast4(%arg0: tensor<3x4x5x6x7xf32>, %arg1: tensor<7x8xf32>) -> tensor<3x4x5x6x8xf32> { %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "abcdh,hg->abcdg"}: (tensor<3x4x5x6x7xf32>, tensor<7x8xf32>) -> tensor<3x4x5x6x8xf32> func.return %0 : tensor<3x4x5x6x8xf32> // CHECK-LABEL: einsum_broadcast4 - // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<3x4x5x6x7xf32>, tensor<7x8xf32>) -> tensor<3x4x5x6x8xf32> + // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<3x4x5x6x7xf32>, tensor<7x8xf32>) -> tensor<3x4x5x6x8xf32> } func.func @einsum_reducesum(%arg0: tensor<2x5x7xf32>, %arg1: tensor<5x2xf32>) -> tensor<5x7xf32> { @@ -86,7 +86,7 @@ func.func @einsum_reducesum(%arg0: tensor<2x5x7xf32>, %arg1: tensor<5x2xf32>) -> // CHECK-DAG: %[[cst_2:.*]] = arith.constant dense<[5, 7]> : tensor<2xi64> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst]]) : (tensor<2x5x7xf32>, tensor<3xi32>) -> tensor<5x7x2xf32> // CHECK: %[[v1:.*]] = "tf.Reshape"(%arg1, %[[cst_1]]) : (tensor<5x2xf32>, tensor<3xi64>) -> tensor<5x2x1xf32> - // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : 
(tensor<5x7x2xf32>, tensor<5x2x1xf32>) -> tensor<5x7x1xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) <{adj_x = false, adj_y = false}> : (tensor<5x7x2xf32>, tensor<5x2x1xf32>) -> tensor<5x7x1xf32> // CHECK: %[[v3:.*]] = "tf.Reshape"(%[[v2]], %[[cst_2]]) : (tensor<5x7x1xf32>, tensor<2xi64>) -> tensor<5x7xf32> // CHECK: return %[[v3:.*]] : tensor<5x7xf32> } @@ -99,7 +99,7 @@ func.func @einsum_transpose_matmul(%arg0: tensor<2x5x7xf32>, %arg1: tensor<5x3x2 // CHECK-DAG: %[[cst_0:.*]] = arith.constant dense<[0, 2, 1]> : tensor<3xi32> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst]]) : (tensor<2x5x7xf32>, tensor<3xi32>) -> tensor<5x7x2xf32> // CHECK: %[[v1:.*]] = "tf.Transpose"(%arg1, %[[cst_0]]) : (tensor<5x3x2xf32>, tensor<3xi32>) -> tensor<5x2x3xf32> - // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<5x7x2xf32>, tensor<5x2x3xf32>) -> tensor<5x7x3xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) <{adj_x = false, adj_y = false}> : (tensor<5x7x2xf32>, tensor<5x2x3xf32>) -> tensor<5x7x3xf32> // CHECK: %[[v3:.*]] = "tf.Transpose"(%[[v2]], %[[cst_0]]) : (tensor<5x7x3xf32>, tensor<3xi32>) -> tensor<5x3x7xf32> } @@ -111,7 +111,7 @@ func.func @einsum_4D(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<2x4x7x3xf32>) -> // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst]]) : (tensor<2x5x7x3xf32>, tensor<4xi32>) -> tensor<2x7x5x3xf32> // CHECK: %[[v1:.*]] = "tf.Transpose"(%arg1, %[[cst_1]]) : (tensor<2x4x7x3xf32>, tensor<4xi32>) -> tensor<2x7x3x4xf32> - // CHECK: "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<2x7x5x3xf32>, tensor<2x7x3x4xf32>) -> tensor<2x7x5x4xf32> + // CHECK: "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) <{adj_x = false, adj_y = false}> : (tensor<2x7x5x3xf32>, tensor<2x7x3x4xf32>) -> tensor<2x7x5x4xf32> } func.func @einsum_matrixdotprod(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<7x3x4xf32>) -> tensor<2x5x4xf32> { @@ -122,7 +122,7 @@ func.func @einsum_matrixdotprod(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<7x3x4x // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[21, 4]> : tensor<2xi64> // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<2x5x7x3xf32>, tensor<3xi64>) -> tensor<2x5x21xf32> // CHECK: %[[v1:.*]] = "tf.Reshape"(%arg1, %[[cst_1]]) : (tensor<7x3x4xf32>, tensor<2xi64>) -> tensor<21x4xf32> - // CHECK: "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<2x5x21xf32>, tensor<21x4xf32>) -> tensor<2x5x4xf32> + // CHECK: "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) <{adj_x = false, adj_y = false}> : (tensor<2x5x21xf32>, tensor<21x4xf32>) -> tensor<2x5x4xf32> } func.func @einsum_reshapetail(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5x6x2xf32>) -> tensor<3x4x6x2xf32> { @@ -132,7 +132,7 @@ func.func @einsum_reshapetail(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5x6x2xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[5, 12]> : tensor<2xi64> // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[3, 4, 6, 2]> : tensor<4xi64> // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg1, %[[cst]]) : (tensor<5x6x2xf32>, tensor<2xi64>) -> tensor<5x12xf32> - // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %[[v0]]) {adj_x = false, adj_y = false} : (tensor<3x4x5xf32>, tensor<5x12xf32>) -> tensor<3x4x12xf32> + // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %[[v0]]) <{adj_x = false, adj_y = false}> : (tensor<3x4x5xf32>, tensor<5x12xf32>) -> tensor<3x4x12xf32> // CHECK: %[[v2:.*]] = 
"tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<3x4x12xf32>, tensor<4xi64>) -> tensor<3x4x6x2xf32> // CHECK: return %[[v2]] : tensor<3x4x6x2xf32> } @@ -144,7 +144,7 @@ func.func @einsum_reduceddim(%arg0: tensor<2x5x7xf32>, %arg1: tensor<2x5x7x3xf32 // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[2, 5, 1, 7]> : tensor<4xi64> // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[2, 5, 3]> : tensor<3xi64> // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<2x5x7xf32>, tensor<4xi64>) -> tensor<2x5x1x7xf32> - // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%[[v0]], %arg1) {adj_x = false, adj_y = false} : (tensor<2x5x1x7xf32>, tensor<2x5x7x3xf32>) -> tensor<2x5x1x3xf32> + // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%[[v0]], %arg1) <{adj_x = false, adj_y = false}> : (tensor<2x5x1x7xf32>, tensor<2x5x7x3xf32>) -> tensor<2x5x1x3xf32> // CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<2x5x1x3xf32>, tensor<3xi64>) -> tensor<2x5x3xf32> // CHECK: return %[[v2]] : tensor<2x5x3xf32> } @@ -158,7 +158,7 @@ func.func @einsum_transposereduceddim(%arg0: tensor<2x5x7xf32>, %arg1: tensor<2x // CHECK-DAG: %[[cst_2:.*]] = arith.constant dense<[2, 5, 3]> : tensor<3xi64> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg1, %[[cst]]) : (tensor<2x5x3x7xf32>, tensor<4xi32>) -> tensor<2x5x7x3xf32> // CHECK: %[[v1:.*]] = "tf.Reshape"(%arg0, %[[cst_1]]) : (tensor<2x5x7xf32>, tensor<4xi64>) -> tensor<2x5x1x7xf32> - // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v1]], %[[v0]]) {adj_x = false, adj_y = false} : (tensor<2x5x1x7xf32>, tensor<2x5x7x3xf32>) -> tensor<2x5x1x3xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v1]], %[[v0]]) <{adj_x = false, adj_y = false}> : (tensor<2x5x1x7xf32>, tensor<2x5x7x3xf32>) -> tensor<2x5x1x3xf32> // CHECK: %[[v3:.*]] = "tf.Reshape"(%[[v2]], %[[cst_2]]) : (tensor<2x5x1x3xf32>, tensor<3xi64>) -> tensor<2x5x3xf32> // CHECK: return %[[v3]] : tensor<2x5x3xf32> } @@ -169,7 +169,7 @@ func.func @einsum_fourdreducelast(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<2x3x // CHECK-LABEL: einsum_fourdreducelast // CHECK: %[[cst:.*]] = arith.constant dense<[0, 2, 1, 3]> : tensor<4xi32> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg1, %[[cst]]) : (tensor<2x3x5x13xf32>, tensor<4xi32>) -> tensor<2x5x3x13xf32> - // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %[[v0]]) {adj_x = false, adj_y = false} : (tensor<2x5x7x3xf32>, tensor<2x5x3x13xf32>) -> tensor<2x5x7x13xf32> + // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %[[v0]]) <{adj_x = false, adj_y = false}> : (tensor<2x5x7x3xf32>, tensor<2x5x3x13xf32>) -> tensor<2x5x7x13xf32> // CHECK: %[[v2:.*]] = "tf.Transpose"(%[[v1]], %[[cst]]) : (tensor<2x5x7x13xf32>, tensor<4xi32>) -> tensor<2x7x5x13xf32> // CHECK: return %[[v2]] : tensor<2x7x5x13xf32> } @@ -183,7 +183,7 @@ func.func @einsum_fourdtransposeall(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<2x // CHECK-DAG: %[[cst_2:.*]] = arith.constant dense<[0, 1, 3, 2]> : tensor<4xi32> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst]]) : (tensor<2x5x7x3xf32>, tensor<4xi32>) -> tensor<2x7x5x3xf32> // CHECK: %[[v1:.*]] = "tf.Transpose"(%arg1, %[[cst_1]]) : (tensor<2x11x7x3xf32>, tensor<4xi32>) -> tensor<2x7x3x11xf32> - // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<2x7x5x3xf32>, tensor<2x7x3x11xf32>) -> tensor<2x7x5x11xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) <{adj_x = false, adj_y = false}> : (tensor<2x7x5x3xf32>, tensor<2x7x3x11xf32>) -> tensor<2x7x5x11xf32> // CHECK: %[[v3:.*]] = "tf.Transpose"(%[[v2]], %[[cst_2]]) : 
(tensor<2x7x5x11xf32>, tensor<4xi32>) -> tensor<2x7x11x5xf32> // CHECK: return %[[v3]] : tensor<2x7x11x5xf32> } @@ -196,7 +196,7 @@ func.func @einsum_4d_1(%arg0: tensor<3x4x5x6xf32>, %arg1: tensor<3x7x5x6xf32>) - // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst:.*]]) : (tensor<3x4x5x6xf32>, tensor<4xi32>) -> tensor<3x5x4x6xf32> // CHECK: %[[v1:.*]] = "tf.Transpose"(%arg1, %[[cst_1]]) : (tensor<3x7x5x6xf32>, tensor<4xi32>) -> tensor<3x5x6x7xf32> - // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<3x5x4x6xf32>, tensor<3x5x6x7xf32>) -> tensor<3x5x4x7xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) <{adj_x = false, adj_y = false}> : (tensor<3x5x4x6xf32>, tensor<3x5x6x7xf32>) -> tensor<3x5x4x7xf32> // CHECK: return %[[v2]] : tensor<3x5x4x7xf32> } @@ -204,7 +204,7 @@ func.func @einsum_no_match(%arg0: tensor<4x5x6xf32>, %arg1: tensor<5xf32>) -> te %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,j->i"}: (tensor<4x5x6xf32>, tensor<5xf32>) -> tensor<4xf32> func.return %0 : tensor<4xf32> // CHECK-LABEL: einsum_no_match -// CHECK: %[[v0:.*]] = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,j->i"} : (tensor<4x5x6xf32>, tensor<5xf32>) -> tensor<4xf32> +// CHECK: %[[v0:.*]] = "tf.Einsum"(%arg0, %arg1) <{equation = "ijk,j->i"}> {T = "tfdtype$DT_FLOAT"} : (tensor<4x5x6xf32>, tensor<5xf32>) -> tensor<4xf32> // CHECK: return %[[v0]] } @@ -212,7 +212,7 @@ func.func @einsum_illegal_no_match(%arg0: tensor<4x5xf32>, %arg1: tensor<5xf32>) %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,?zw->kq->i"}: (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> func.return %0 : tensor<4xf32> // CHECK-LABEL: einsum_illegal_no_match -// CHECK: %[[v0:.*]] = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,?zw->kq->i"} : (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> +// CHECK: %[[v0:.*]] = "tf.Einsum"(%arg0, %arg1) <{equation = "ij,?zw->kq->i"}> {T = "tfdtype$DT_FLOAT"} : (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> // CHECK: return %[[v0]] } @@ -223,7 +223,7 @@ func.func @batch_multilhs_einsum(%arg0: tensor<2x1x1x11xf32>, %arg1: tensor<2x11 // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[2, 1, 11]> : tensor<3xi64> // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[2, 1, 1, 2]> : tensor<4xi64> // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<2x1x1x11xf32>, tensor<3xi64>) -> tensor<2x1x11xf32> -// CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%[[v0]], %arg1) {adj_x = false, adj_y = false} : (tensor<2x1x11xf32>, tensor<2x11x2xf32>) -> tensor<2x1x2xf32> +// CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%[[v0]], %arg1) <{adj_x = false, adj_y = false}> : (tensor<2x1x11xf32>, tensor<2x11x2xf32>) -> tensor<2x1x2xf32> // CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<2x1x2xf32>, tensor<4xi64>) -> tensor<2x1x1x2xf32> // CHECK: return %[[v2]] : tensor<2x1x1x2xf32> } @@ -236,14 +236,14 @@ func.func @einsum_with_runtime_outputshape1(%arg0 : tensor, %arg1 : // CHECK-DAG: %[[cst_0:.*]] = arith.constant dense<[-1, 36, 1, 32]> : tensor<4xi64> // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[0, 1]> : tensor<2xi32> // CHECK-DAG: %[[cst_2:.*]] = arith.constant dense<2> : tensor<1xi32> -// CHECK-DAG: %[[cst_3:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %[[cst_3:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[v0:.*]] = 
"tf.Transpose"(%arg1, %cst) : (tensor, tensor<4xi32>) -> tensor // CHECK: %[[v1:.*]] = "tf.Reshape"(%arg0, %cst_0) : (tensor, tensor<4xi64>) -> tensor -// CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%1, %0) {adj_x = false, adj_y = false} : (tensor, tensor) -> tensor +// CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%1, %0) <{adj_x = false, adj_y = false}> : (tensor, tensor) -> tensor // CHECK: %[[v3:.*]] = "tf.Shape"(%arg0) : (tensor) -> tensor<3xi32> // CHECK: %[[v4:.*]] = "tf.Shape"(%arg1) : (tensor) -> tensor<4xi32> -// CHECK: %[[v5:.*]] = "tf.Gather"(%3, %cst_1) {validate_indices = true} : (tensor<3xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: %[[v6:.*]] = "tf.Gather"(%4, %cst_2) {validate_indices = true} : (tensor<4xi32>, tensor<1xi32>) -> tensor<1xi32> +// CHECK: %[[v5:.*]] = "tf.Gather"(%3, %cst_1) <{validate_indices = true}> : (tensor<3xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: %[[v6:.*]] = "tf.Gather"(%4, %cst_2) <{validate_indices = true}> : (tensor<4xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: %[[v7:.*]] = "tf.Concat"(%cst_3, %5, %6) : (tensor, tensor<2xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %[[v8:.*]] = "tf.Reshape"(%2, %7) : (tensor, tensor<3xi32>) -> tensor // CHECK: return %[[v8]] : tensor @@ -254,13 +254,13 @@ func.func @einsum_with_runtime_outputshape2(%arg0 : tensor, %arg1 func.return %0 : tensor // CHECK-LABEL: einsum_with_runtime_outputshape2 // CHECK-DAG: %[[cst:.*]] = arith.constant dense<1024> : tensor<2xi64> -// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() {value = dense<[8, 128]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<[8, 128]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[0, 1]> : tensor<2xi32> -// CHECK-DAG: %[[cst_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-DAG: %[[cst_2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg1, %cst) : (tensor<1024x8x128xf32>, tensor<2xi64>) -> tensor<1024x1024xf32> -// CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %0) {adj_x = false, adj_y = false} : (tensor, tensor<1024x1024xf32>) -> tensor +// CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %0) <{adj_x = false, adj_y = false}> : (tensor, tensor<1024x1024xf32>) -> tensor // CHECK: %[[v2:.*]] = "tf.Shape"(%arg0) : (tensor) -> tensor<3xi32> -// CHECK: %[[v3:.*]] = "tf.Gather"(%2, %cst_1) {validate_indices = true} : (tensor<3xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: %[[v3:.*]] = "tf.Gather"(%2, %cst_1) <{validate_indices = true}> : (tensor<3xi32>, tensor<2xi32>) -> tensor<2xi32> // CHECK: %[[v4:.*]] = "tf.Concat"(%cst_2, %3, %cst_0) : (tensor, tensor<2xi32>, tensor<2xi32>) -> tensor<4xi32> // CHECK: %[[v5:.*]] = "tf.Reshape"(%1, %4) : (tensor, tensor<4xi32>) -> tensor // CHECK: return %[[v5]] : tensor @@ -275,7 +275,7 @@ func.func @einsum_with_runtime_shape1(%arg0 : tensor, %arg1 : tensor // CHECK: %[[v0:.*]] = "tf.Shape"(%arg0) : (tensor) -> tensor<3xi32> // CHECK: %[[v1:.*]] = "tf.UnsortedSegmentProd"(%0, %cst, %cst_0) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<4xi32> // CHECK: %[[v2:.*]] = "tf.Reshape"(%arg0, %1) : (tensor, tensor<4xi32>) -> tensor -// CHECK: %[[v3:.*]] = "tf.BatchMatMulV2"(%2, %arg1) {adj_x = false, adj_y = false} : (tensor, tensor) -> tensor +// CHECK: %[[v3:.*]] = "tf.BatchMatMulV2"(%2, %arg1) <{adj_x = false, adj_y = false}> : (tensor, tensor) -> tensor // CHECK: %[[v4:.*]] = "tf.Reshape"(%3, %cst_1) : (tensor, tensor<3xi64>) -> tensor // CHECK: return 
%[[v4]] : tensor } @@ -286,14 +286,14 @@ func.func @einsum_with_runtime_shape2(%arg0 : tensor, %arg1 : tens // CHECK-LABEL: einsum_with_runtime_shape2 // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[1, 2, 0]> : tensor<3xi32> // CHECK-DAG: %[[cst_0:.*]] = arith.constant dense<[0, 1, 2, 2]> : tensor<4xi32> -// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor +// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor // CHECK-DAG: %[[cst_2:.*]] = arith.constant dense<[512, 8]> : tensor<2xi64> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg1, %cst) : (tensor<8x8x64xf32>, tensor<3xi32>) -> tensor<8x64x8xf32> // CHECK: %[[v1:.*]] = "tf.Shape"(%arg0) : (tensor) -> tensor<4xi32> // CHECK: %[[v2:.*]] = "tf.UnsortedSegmentProd"(%1, %cst_0, %cst_1) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<3xi32> // CHECK: %[[v3:.*]] = "tf.Reshape"(%arg0, %2) : (tensor, tensor<3xi32>) -> tensor // CHECK: %[[v4:.*]] = "tf.Reshape"(%0, %cst_2) : (tensor<8x64x8xf32>, tensor<2xi64>) -> tensor<512x8xf32> -// CHECK: %[[v5:.*]] = "tf.BatchMatMulV2"(%3, %4) {adj_x = false, adj_y = false} : (tensor, tensor<512x8xf32>) -> tensor +// CHECK: %[[v5:.*]] = "tf.BatchMatMulV2"(%3, %4) <{adj_x = false, adj_y = false}> : (tensor, tensor<512x8xf32>) -> tensor // CHECK: return %[[v5]] : tensor } @@ -305,7 +305,7 @@ func.func @einsum_no_reshape(%arg0 : tensor, %arg1 : tensor<1x?x8 // CHECK-DAG: %[[cst_0:.*]] = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %cst) : (tensor, tensor<4xi32>) -> tensor // CHECK: %[[v1:.*]] = "tf.Transpose"(%arg1, %cst_0) : (tensor<1x?x8x128xf32>, tensor<4xi32>) -> tensor<1x8x128x?xf32> -// CHECK: %[[v3:.*]] = "tf.BatchMatMulV2"(%0, %1) {adj_x = false, adj_y = false} : (tensor, tensor<1x8x128x?xf32>) -> tensor +// CHECK: %[[v3:.*]] = "tf.BatchMatMulV2"(%0, %1) <{adj_x = false, adj_y = false}> : (tensor, tensor<1x8x128x?xf32>) -> tensor // CHECK: %[[v4:.*]] = "tf.Transpose"(%2, %cst) : (tensor, tensor<4xi32>) -> tensor // CHECK: return %[[v4]] : tensor } @@ -314,7 +314,7 @@ func.func @einsum_ellipsis(%arg0: tensor<1x512x128xf32>, %arg1: tensor<128x256xf %0 = "tf.Einsum"(%arg0, %arg1) {device = "", equation = "...x,xy->...y"} : (tensor<1x512x128xf32>, tensor<128x256xf32>) -> tensor<1x512x256xf32> func.return %0 : tensor<1x512x256xf32> // CHECK-LABEL: einsum_ellipsis -// CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x512x128xf32>, tensor<128x256xf32>) -> tensor<1x512x256xf32> +// CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<1x512x128xf32>, tensor<128x256xf32>) -> tensor<1x512x256xf32> } func.func @einsum_ellipsis_in_both_sides(%arg0: tensor<1x11x19xf32>, %arg1: tensor<7x11x13x19xf32>) -> tensor<7x11x13xf32> { @@ -326,7 +326,7 @@ func.func @einsum_ellipsis_in_both_sides(%arg0: tensor<1x11x19xf32>, %arg1: tens // CHECK-DAG: %[[cst_2:.*]] = arith.constant dense<[7, 11, 13]> : tensor<3xi64> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg1, %[[cst]]) : (tensor<7x11x13x19xf32>, tensor<4xi32>) -> tensor<7x11x19x13xf32> // CHECK: %[[v1:.*]] = "tf.Reshape"(%arg0, %[[cst_1]]) : (tensor<1x11x19xf32>, tensor<4xi64>) -> tensor<1x11x1x19xf32> - // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v1]], %[[v0]]) {adj_x = false, adj_y = false} : (tensor<1x11x1x19xf32>, tensor<7x11x19x13xf32>) -> tensor<7x11x1x13xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v1]], %[[v0]]) <{adj_x = false, adj_y = false}> : (tensor<1x11x1x19xf32>, 
tensor<7x11x19x13xf32>) -> tensor<7x11x1x13xf32> // CHECK: %[[v3:.*]] = "tf.Reshape"(%[[v2]], %[[cst_2]]) : (tensor<7x11x1x13xf32>, tensor<3xi64>) -> tensor<7x11x13xf32> // CHECK: return %[[v3]] : tensor<7x11x13xf32> } @@ -338,7 +338,7 @@ func.func @einsum_ellipsis_with_broadcast(%arg0: tensor<5x4x3xf32>, %arg1: tenso // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[2, 0, 1]> : tensor<3xi32> // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[1, 2, 0]> : tensor<3xi32> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg1, %[[cst]]) : (tensor<3x2x1xf32>, tensor<3xi32>) -> tensor<1x3x2xf32> - // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %[[v0]]) {adj_x = false, adj_y = false} : (tensor<5x4x3xf32>, tensor<1x3x2xf32>) -> tensor<5x4x2xf32> + // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %[[v0]]) <{adj_x = false, adj_y = false}> : (tensor<5x4x3xf32>, tensor<1x3x2xf32>) -> tensor<5x4x2xf32> // CHECK: %[[v2:.*]] = "tf.Transpose"(%[[v1]], %[[cst_1]]) : (tensor<5x4x2xf32>, tensor<3xi32>) -> tensor<4x2x5xf32> // CHECK: return %[[v2]] : tensor<4x2x5xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir index fdb64b900e7a8d..8aa1cf650550f4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir @@ -17,7 +17,7 @@ module { // CHECK: {{.*StatefulPartitionedCall.* f = @non_tpu.*}} // CHECK: {{.*StatefulPartitionedCall.* f = @start_step_1.*}} // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}} - // CHECK: {{.*tf.While.* body = @new_while_body.* cond = @new_while_cond.*}} + // CHECK: {{.*tf.While.* <{body = @new_while_body.* cond = @new_while_cond.*}} // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm2.*}} // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm1.*}} // CHECK: return @@ -73,7 +73,7 @@ module { // CHECK: {{.*StatefulPartitionedCall.* f = @non_tpu.*}} // CHECK: {{.*StatefulPartitionedCall.* f = @start_step_1.*}} // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}} - // CHECK: {{.*tf.While.* body = @new_while_body.* cond = @new_while_cond.*}} + // CHECK: {{.*tf.While.* <{body = @new_while_body.* cond = @new_while_cond.*}} // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm2.*}} // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm1.*}} // CHECK: return @@ -112,7 +112,7 @@ module { func.func private @while_body(%arg0: tensor) -> (tensor) { // The pipelining control flow and supporting functions stay the same as the training version above. // The order of these functions is also significant. 
- // CHECK: {{.*tf.While.* body = @new_while_body.* cond = @new_while_cond.* parallel_iterations = 3}} + // CHECK: {{.*tf.While.* <{body = @new_while_body.* cond = @new_while_cond.* parallel_iterations = 3}} // CHECK: return // metadata ops "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () diff --git a/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir index 488c98af16f55f..87acd459ed1350 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir @@ -484,7 +484,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[RECV_OUTPUT_2:[0-9]*]] = "tf._XlaRecvAtHostV2"(%[[PROGRAM_OUTPUT]], %[[DEVICE_ORDINAL]]) // CHECK-SAME: key = "host_compute_channel_1_args" // CHECK: "tf.D"(%[[RECV_OUTPUT_2]]) - // CHECK: "tf_device.cluster" + // CHECK: "tf_device.cluster"() // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) // CHECK-SAME: send_key = "host_compute_channel_0_args" @@ -581,14 +581,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHostV2"(%[[PROGRAM_OUTPUT]], %[[DEVICE_ORDINAL]]) // CHECK-SAME: key = "if_predicate_channel_1" // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK: _else_func_name = "test_else_name" + // CHECK-SAME: _then_func_name = "test_then_name" + // CHECK-SAME: is_stateless = false // CHECK-NEXT: %[[ARG_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHostV2"(%[[PROGRAM_OUTPUT]], %[[DEVICE_ORDINAL]]) // CHECK-SAME: key = "host_compute_channel_0_args" // CHECK: "tf.D"(%[[ARG_RECV_OUTPUT]]#0, %[[ARG_RECV_OUTPUT]]#1) // CHECK-NOT: "tf._XlaSendFromHostV2"(%[[PROGRAM_OUTPUT]], %[[DEVICE_ORDINAL]]) // CHECK: "tf.Yield"() : () -> () - // CHECK: _else_func_name = "test_else_name" - // CHECK-SAME: _then_func_name = "test_then_name" - // CHECK-SAME: is_stateless = false // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" @@ -596,11 +596,11 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: "tf._XlaHostComputeMlir" // CHECK-SAME: key = "if_predicate_channel_1" // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK: is_stateless = false // CHECK: "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]]) // CHECK-SAME: recv_key = "host_compute_channel_0_retvals" // CHECK-SAME: send_key = "host_compute_channel_0_args" // CHECK-NEXT: "tf.Yield"() : () -> () - // CHECK: is_stateless = false %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor<2xi32>) {n = 2 : i32} { %2 = "tf_device.cluster"() ({ %3 = "tf.A"() : () -> (tensor<2xi32>) @@ -637,13 +637,13 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHostV2"(%[[PROGRAM_OUTPUT]], %[[DEVICE_ORDINAL]]) // CHECK-SAME: key = "if_predicate_channel_1" // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK: _else_func_name = "test_else_name" + // CHECK-SAME _then_func_name = "test_then_name" // CHECK-NEXT: %[[ARG_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHostV2"(%[[PROGRAM_OUTPUT]], %[[DEVICE_ORDINAL]]) // CHECK-SAME: key = "host_compute_channel_0_args" // CHECK: 
"tf.D"(%[[ARG_RECV_OUTPUT]]#0, %[[ARG_RECV_OUTPUT]]#1) // CHECK-NOT: "tf._XlaSendFromHostV2"(%[[PROGRAM_OUTPUT]], %[[DEVICE_ORDINAL]]) // CHECK: "tf.Yield"() : () -> () - // CHECK: _else_func_name = "test_else_name" - // CHECK-SAME _then_func_name = "test_then_name" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" @@ -1044,6 +1044,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-DAG: %[[PROGRAM_OUTPUT:.+]] = "tf._XlaCompileMlirPlaceholderProgramKey" // CHECK-DAG: %[[DEVICE_ORDINAL:.+]] = "tf._TPUDeviceOrdinalPlaceholder" // CHECK-NEXT: tf.WhileRegion" + // CHECK: is_stateless = false // CHECK-NEXT: %[[COND_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHostV2"(%[[PROGRAM_OUTPUT]], %[[DEVICE_ORDINAL]]) // CHECK-SAME: key = "while_condition_channel_0" // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT]]) @@ -1051,19 +1052,18 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D" // CHECK: "tf._XlaSendFromHostV2"(%[[D_OUTPUT]], %[[PROGRAM_OUTPUT]], %[[DEVICE_ORDINAL]]) // CHECK-NEXT: "tf.Yield" - // CHECK: is_stateless = false // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" // CHECK-NEXT: tf.WhileRegion"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK: is_stateless = false // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H" // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) // CHECK-NEXT: "tf.Yield"(%[[H_OUTPUT]]) // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" // CHECK-NEXT: %[[HOST_COMPUTE_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir" // CHECK-NEXT: "tf.Yield"(%[[C_OUTPUT]], %[[HOST_COMPUTE_OUTPUT]]) - // CHECK: is_stateless = false %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor<2xi32>) {n = 2 : i32} { %2 = "tf_device.cluster"() ({ %3 = "tf.A"() : () -> (tensor<3xf32>) @@ -2118,34 +2118,34 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", // CHECK: "tf_device.launch" // CHECK: %[[PROGRAM0:.+]] = "tf._XlaCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV0:.+]] = "tf._XlaRecvAtHost"(%[[PROGRAM0]]) - // CHECK-SAME: _xla_has_host_transfer = true // CHECK-SAME: device_ordinal = 0 // CHECK-SAME: key = "host_compute_channel_0_args" + // CHECK-SAME: _xla_has_host_transfer = true // CHECK: %[[B0:.+]] = "tf.OpB"(%[[RECV0]]) : (tensor<2x2xi64>) -> tensor<2x2xi64> // CHECK: "tf._XlaSendFromHost"(%[[B0]], %[[PROGRAM0]]) - // CHECK-SAME: _xla_has_host_transfer = true // CHECK-SAME: device_ordinal = 0 // CHECK-SAME: key = "host_compute_channel_0_retvals" + // CHECK-SAME: _xla_has_host_transfer = true // CHECK: }, { // CHECK: %[[PROGRAM1:.+]] = "tf._XlaCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV1:.+]] = "tf._XlaRecvAtHost"(%[[PROGRAM1]]) - // CHECK-SAME: _xla_has_host_transfer = true // CHECK-SAME: device_ordinal = 1 // CHECK-SAME: key = "host_compute_channel_0_args" + // CHECK-SAME: _xla_has_host_transfer = true // CHECK: %[[B1:.+]] = "tf.OpB"(%[[RECV1]]) : (tensor<2x2xi64>) -> tensor<2x2xi64> // CHECK: "tf._XlaSendFromHost"(%[[B1]], %[[PROGRAM1]]) - // CHECK-SAME: _xla_has_host_transfer = true // CHECK-SAME: device_ordinal = 1 // CHECK-SAME: key = "host_compute_channel_0_retvals" + // CHECK-SAME: _xla_has_host_transfer = true // CHECK: }, { // CHECK: "tf_device.cluster" // CHECK: %[[A:.+]] = "tf.OpA" - // CHECK: %[[A_SHARD:.+]] = "tf.XlaSpmdFullToShardShape"(%[[A]]) {dim = -1 : i64, manual_sharding = 
"\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<2x2xi64>) -> tensor<1x2xi64> + // CHECK: %[[A_SHARD:.+]] = "tf.XlaSpmdFullToShardShape"(%[[A]]) <{dim = -1 : i64, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []}> : (tensor<2x2xi64>) -> tensor<1x2xi64> // CHECK: %[[B:.+]] = "tf._XlaHostComputeMlir"(%[[A_SHARD]]) // CHECK-SAME: manual_sharding = true // CHECK-SAME: recv_key = "host_compute_channel_0_retvals" // CHECK-SAME: send_key = "host_compute_channel_0_args" - // CHECK: %[[B_FULL:.+]] = "tf.XlaSpmdShardToFullShape"(%[[B]]) {dim = -1 : i64, full_shape = #tf_type.shape<2x2>, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<1x2xi64>) -> tensor<2x2xi64> + // CHECK: %[[B_FULL:.+]] = "tf.XlaSpmdShardToFullShape"(%[[B]]) <{dim = -1 : i64, full_shape = #tf_type.shape<2x2>, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []}> : (tensor<1x2xi64>) -> tensor<2x2xi64> // CHECK: "tf.OpC"(%[[B_FULL]]) "tf_device.cluster"() ({ %0 = "tf.OpA"() {_XlaSharding = "\08\03\1A\02\02\01\22\02\00\01"} : () -> tensor<2x2xi64> @@ -2178,32 +2178,32 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", // CHECK: %[[DEVICE0:.+]] = "tf._TPUDeviceOrdinalPlaceholder" // CHECK-SAME: logical_core = 0 // CHECK: %[[RECV0:.+]] = "tf._XlaRecvAtHostV2"(%[[PROGRAM0]], %[[DEVICE0]]) - // CHECK-SAME: _xla_has_host_transfer = true // CHECK-SAME: key = "host_compute_channel_0_args" + // CHECK-SAME: _xla_has_host_transfer = true // CHECK: %[[B0:.+]] = "tf.OpB"(%[[RECV0]]) : (tensor<2x2xi64>) -> tensor<2x2xi64> // CHECK: "tf._XlaSendFromHostV2"(%[[B0]], %[[PROGRAM0]], %[[DEVICE0]]) - // CHECK-SAME: _xla_has_host_transfer = true // CHECK-SAME: key = "host_compute_channel_0_retvals" + // CHECK-SAME: _xla_has_host_transfer = true // CHECK: }, { // CHECK: %[[PROGRAM1:.+]] = "tf._XlaCompileMlirPlaceholderProgramKey" // CHECK: %[[DEVICE1:.+]] = "tf._TPUDeviceOrdinalPlaceholder" // CHECK-SAME: logical_core = 1 // CHECK: %[[RECV1:.+]] = "tf._XlaRecvAtHostV2"(%[[PROGRAM1]], %[[DEVICE1]]) - // CHECK-SAME: _xla_has_host_transfer = true // CHECK-SAME: key = "host_compute_channel_0_args" + // CHECK-SAME: _xla_has_host_transfer = true // CHECK: %[[B1:.+]] = "tf.OpB"(%[[RECV1]]) : (tensor<2x2xi64>) -> tensor<2x2xi64> // CHECK: "tf._XlaSendFromHostV2"(%[[B1]], %[[PROGRAM1]], %[[DEVICE1]]) - // CHECK-SAME: _xla_has_host_transfer = true // CHECK-SAME: key = "host_compute_channel_0_retvals" + // CHECK-SAME: _xla_has_host_transfer = true // CHECK: }, { // CHECK: "tf_device.cluster" // CHECK: %[[A:.+]] = "tf.OpA" - // CHECK: %[[A_SHARD:.+]] = "tf.XlaSpmdFullToShardShape"(%[[A]]) {dim = -1 : i64, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<2x2xi64>) -> tensor<1x2xi64> + // CHECK: %[[A_SHARD:.+]] = "tf.XlaSpmdFullToShardShape"(%[[A]]) <{dim = -1 : i64, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []}> : (tensor<2x2xi64>) -> tensor<1x2xi64> // CHECK: %[[B:.+]] = "tf._XlaHostComputeMlir"(%[[A_SHARD]]) // CHECK-SAME: manual_sharding = true // CHECK-SAME: recv_key = "host_compute_channel_0_retvals" // CHECK-SAME: send_key = "host_compute_channel_0_args" - // CHECK: %[[B_FULL:.+]] = "tf.XlaSpmdShardToFullShape"(%[[B]]) {dim = -1 : i64, full_shape = #tf_type.shape<2x2>, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<1x2xi64>) -> tensor<2x2xi64> + // CHECK: %[[B_FULL:.+]] = "tf.XlaSpmdShardToFullShape"(%[[B]]) <{dim = -1 : i64, 
full_shape = #tf_type.shape<2x2>, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []}> : (tensor<1x2xi64>) -> tensor<2x2xi64> // CHECK: "tf.OpC"(%[[B_FULL]]) tf_device.replicate() {n = 4 : i32} { "tf_device.cluster"() ({ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/fold-broadcast.mlir b/tensorflow/compiler/mlir/tensorflow/tests/fold-broadcast.mlir index 9e7b5b23eb48dd..5535badb06a06c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/fold-broadcast.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/fold-broadcast.mlir @@ -48,7 +48,7 @@ func.func @broadcast_eq(%arg0: tensor<5x7xf32>, %arg1: tensor<7xf32>) -> tensor< %0 = "tf.BroadcastTo"(%arg1, %cst) : (tensor<7xf32>, tensor<2xi32>) -> tensor<5x7xf32> %1 = "tf.Equal"(%arg0, %0) {incompatible_shape_error = true} : (tensor<5x7xf32>, tensor<5x7xf32>) -> tensor<5x7xi1> func.return %1 : tensor<5x7xi1> - // CHECK: %[[V0:.*]] = "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = true} : (tensor<5x7xf32>, tensor<7xf32>) -> tensor<5x7xi1> + // CHECK: %[[V0:.*]] = "tf.Equal"(%arg0, %arg1) <{incompatible_shape_error = true}> : (tensor<5x7xf32>, tensor<7xf32>) -> tensor<5x7xi1> // CHECK: %[[V0]] : tensor<5x7xi1> } @@ -58,7 +58,7 @@ func.func @broadcast_neq(%arg0: tensor<5x7xf32>, %arg1: tensor<7xf32>) -> tensor %0 = "tf.BroadcastTo"(%arg1, %cst) : (tensor<7xf32>, tensor<2xi32>) -> tensor<5x7xf32> %1 = "tf.NotEqual"(%arg0, %0) {incompatible_shape_error = true} : (tensor<5x7xf32>, tensor<5x7xf32>) -> tensor<5x7xi1> func.return %1 : tensor<5x7xi1> - // CHECK: %[[V0:.*]] = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = true} : (tensor<5x7xf32>, tensor<7xf32>) -> tensor<5x7xi1> + // CHECK: %[[V0:.*]] = "tf.NotEqual"(%arg0, %arg1) <{incompatible_shape_error = true}> : (tensor<5x7xf32>, tensor<7xf32>) -> tensor<5x7xi1> // CHECK: %[[V0]] : tensor<5x7xi1> } @@ -79,7 +79,7 @@ func.func @broadcast_batch_matmul_v2_rhs(%arg0: tensor<17x17x17xf32>, %arg1: ten %0 = "tf.BroadcastTo"(%arg1, %cst) : (tensor<17x24xf32>, tensor<3xi64>) -> tensor<17x17x24xf32> %1 = "tf.BatchMatMulV2"(%arg0, %0) {adj_x = false, adj_y = false} : (tensor<17x17x17xf32>, tensor<17x17x24xf32>) -> tensor<17x17x24xf32> func.return %1 : tensor<17x17x24xf32> - // CHECK: %[[V0:.*]] = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<17x17x17xf32>, tensor<17x24xf32>) -> tensor<17x17x24xf32> + // CHECK: %[[V0:.*]] = "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<17x17x17xf32>, tensor<17x24xf32>) -> tensor<17x17x24xf32> // CHECK: %[[V0]] : tensor<17x17x24xf32> } @@ -89,7 +89,7 @@ func.func @broadcast_batch_matmul_v2_lhs(%arg0: tensor<17x17xf32>, %arg1: tensor %0 = "tf.BroadcastTo"(%arg0, %cst) : (tensor<17x17xf32>, tensor<3xi64>) -> tensor<17x17x17xf32> %1 = "tf.BatchMatMulV2"(%0, %arg1) {adj_x = false, adj_y = false} : (tensor<17x17x17xf32>, tensor<17x17x24xf32>) -> tensor<17x17x24xf32> func.return %1 : tensor<17x17x24xf32> - // CHECK: %[[V0:.*]] = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<17x17xf32>, tensor<17x17x24xf32>) -> tensor<17x17x24xf32> + // CHECK: %[[V0:.*]] = "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<17x17xf32>, tensor<17x17x24xf32>) -> tensor<17x17x24xf32> // CHECK: %[[V0]] : tensor<17x17x24xf32> } @@ -108,6 +108,6 @@ func.func @broadcast_splat_operand() -> tensor<5x5xi64> { %cst = arith.constant dense<5> : tensor<2xi64> %0 = "tf.BroadcastTo"(%cst, %cst) : (tensor<2xi64>, tensor<2xi64>) -> tensor<5x5xi64> func.return %0 : 
tensor<5x5xi64> - // CHECK: %[[V0:.*]] = "tf.Const"() {value = dense<5> : tensor<5x5xi64>} : () -> tensor<5x5xi64> + // CHECK: %[[V0:.*]] = "tf.Const"() <{value = dense<5> : tensor<5x5xi64>}> : () -> tensor<5x5xi64> // CHECK: %[[V0]] : tensor<5x5xi64> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/freeze_variables.mlir b/tensorflow/compiler/mlir/tensorflow/tests/freeze_variables.mlir index 7c18a389de94b5..a458a20d49e510 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/freeze_variables.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/freeze_variables.mlir @@ -464,7 +464,7 @@ module { func.return %arg0, %0 : tensor, tensor<0xf32> } // CHECK: func.func private @f_batch_callee(%[[ARG_0:.*]]: tensor) -> (tensor, tensor<0xf32>) - // CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {{{.*value = dense<> : tensor<0xf32>.*}}} : () -> tensor<0xf32> + // CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<> : tensor<0xf32>}> : () -> tensor<0xf32> // CHECK: return %[[ARG_0]], %[[CST_0]] : tensor, tensor<0xf32> func.func @f(%arg: tensor<1xf32>) -> (tensor<*xf32>, tensor<*xf32>) { @@ -474,6 +474,6 @@ module { } // CHECK: func.func @f(%[[ARG_1:.*]]: tensor<1xf32>) // Make sure that `operandSegmentSizes` attribute is also updated. - // CHECK-NEXT: %[[BATCH_FUNC:.*]]:2 = "tf.BatchFunction"(%[[ARG_1]]) {{{.*operandSegmentSizes = array.*}}} : (tensor<1xf32>) -> (tensor<*xf32>, tensor<*xf32>) + // CHECK-NEXT: %[[BATCH_FUNC:.*]]:2 = "tf.BatchFunction"(%[[ARG_1]]) <{{{.*operandSegmentSizes = array.*}}}> : (tensor<1xf32>) -> (tensor<*xf32>, tensor<*xf32>) // CHECK: return %[[BATCH_FUNC]]#0, %[[BATCH_FUNC]]#1 : tensor<*xf32>, tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir index 4339cd725617fb..8deedc067f20fa 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir @@ -68,14 +68,14 @@ func.func @testIfCasts(%arg0: tensor, %arg1: tensor // CHECK: cf.cond_br [[PRED]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: [[CAST0:%.+]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor>>) -> tensor +// CHECK: [[CAST0:%.+]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor>>) -> tensor // CHECK: [[THEN:%.+]] = call @testIfThen([[CAST0]]) : (tensor) -> tensor -// CHECK: [[CAST1:%.+]] = "tf.Cast"([[THEN]]) {Truncate = false} : (tensor) -> tensor>> +// CHECK: [[CAST1:%.+]] = "tf.Cast"([[THEN]]) <{Truncate = false}> : (tensor) -> tensor>> // CHECK: cf.br ^bb3([[CAST1]] : tensor>>) // CHECK: ^bb2: -// CHECK: [[CAST2:%.+]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor>>) -> tensor +// CHECK: [[CAST2:%.+]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor>>) -> tensor // CHECK: [[ELSE:%.+]] = call @testIfElse([[CAST2]]) : (tensor) -> tensor -// CHECK: [[CAST3:%.+]] = "tf.Cast"([[ELSE]]) {Truncate = false} : (tensor) -> tensor>> +// CHECK: [[CAST3:%.+]] = "tf.Cast"([[ELSE]]) <{Truncate = false}> : (tensor) -> tensor>> // CHECK: cf.br ^bb3([[CAST3]] : tensor>>) // CHECK: ^bb3([[BBARG0:%.+]]: tensor>>): // CHECK: return [[BBARG0]] : tensor>> @@ -201,20 +201,20 @@ func.func @testWhileCasts(%arg0: tensor>>) -> ( cond = @testWhileCond, body = @testWhileBody, is_stateless = false } : (tensor>>) -> (tensor>>) func.return %0 : tensor>> -// CHECK: [[CASTENTRY:%.+]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor>>) -> tensor +// CHECK: [[CASTENTRY:%.+]] = "tf.Cast"(%arg0) <{Truncate = 
false}> : (tensor>>) -> tensor // CHECK: cf.br ^bb1([[CASTENTRY]] : tensor) // CHECK: ^bb1([[CONDARG0:%.+]]: tensor): // 2 preds: ^bb0, ^bb2 // CHECK: [[CONTINUE:%.+]] = call @testWhileCond([[CONDARG0]]) : (tensor) -> tensor // CHECK: [[TOBOOL:%.+]] = "tf.ToBool"([[CONTINUE]]) : (tensor) -> tensor // CHECK: [[PRED:%.+]] = tensor.extract [[TOBOOL]][] : tensor -// CHECK: [[CASTCONDARG0:%.+]] = "tf.Cast"([[CONDARG0]]) {Truncate = false} : (tensor) -> tensor>> +// CHECK: [[CASTCONDARG0:%.+]] = "tf.Cast"([[CONDARG0]]) <{Truncate = false}> : (tensor) -> tensor>> // CHECK: cf.cond_br [[PRED]], ^bb2([[CASTCONDARG0]] : tensor>>), ^bb3([[CASTCONDARG0]] : tensor>>) // CHECK: ^bb2([[BODYARG0:%.+]]: tensor>>): // pred: ^bb1 // CHECK: [[WHILERET:%.+]] = call @testWhileBody([[BODYARG0]]) : (tensor>>) -> tensor>> -// CHECK: [[CASTWHILERET:%.+]] = "tf.Cast"([[WHILERET]]) {Truncate = false} : (tensor>>) -> tensor +// CHECK: [[CASTWHILERET:%.+]] = "tf.Cast"([[WHILERET]]) <{Truncate = false}> : (tensor>>) -> tensor // CHECK: cf.br ^bb1([[CASTWHILERET]] : tensor) // CHECK: ^bb3([[EXITARG0:%.+]]: tensor>>): // pred: ^bb1 -// CHECK: [[CASTEXITARG0:%.+]] = "tf.Cast"([[EXITARG0]]) {Truncate = false} : (tensor>>) -> tensor>> +// CHECK: [[CASTEXITARG0:%.+]] = "tf.Cast"([[EXITARG0]]) <{Truncate = false}> : (tensor>>) -> tensor>> // CHECK: return [[CASTEXITARG0]] : tensor>> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir index d426267f26ccd3..c5cf58971296fb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir @@ -14,17 +14,17 @@ func.func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf3 } : (tensor, tensor<*xf32>) -> tensor<*xf32> // CHECK: "tf.IfRegion" + // CHECK-SAME: <{_else_func_name = "testIf1Else" + // CHECK-SAME: _then_func_name = "testIf1Then" + // CHECK-SAME: is_stateless = false // CHECK: [[Result0:%.*]] = func.call @testIf1Then // CHECK: "tf.Yield"([[Result0]]) // CHECK: [[Result1:%.*]] = func.call @testIf1Else // CHECK: "tf.Yield"([[Result1]]) // CHECK: _attr0 = 10 // CHECK-SAME: _attr1 = true - // CHECK-SAME: _else_func_name = "testIf1Else" - // CHECK-SAME: _then_func_name = "testIf1Then" // CHECK-NOT: attr2 = // CHECK-NOT: else_branch - // CHECK-SAME: is_stateless = false // CHECK-NOT: then_branch // CHECK-SAME: } func.return %0 : tensor<*xf32> @@ -179,6 +179,7 @@ func.func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { } : (tensor<*xf32>) -> (tensor<*xf32>) // CHECK: [[Result0:%.*]] = "tf.WhileRegion" + // CHECK-SAME: is_stateless = true // CHECK: ^bb0([[CARG0:%[^:]*]]: // CHECK: [[Result1:%.*]] = func.call @testWhileCond // CHECK: "tf.Yield"([[Result1]], [[CARG0]]) @@ -189,7 +190,6 @@ func.func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { // CHECK-NOT: attr2 = // CHECK-NOT: cond = // CHECK-NOT: body = - // CHECK-SAME: is_stateless = true // CHECK: return [[Result0]] func.return %1 : tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/fused_kernel_matcher.mlir b/tensorflow/compiler/mlir/tensorflow/tests/fused_kernel_matcher.mlir index 8458cb80df6983..38380ded0c93bb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/fused_kernel_matcher.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/fused_kernel_matcher.mlir @@ -6,7 +6,7 @@ // CHECK-LABEL: conv2DBiasAdd_noActivation func.func 
@conv2DBiasAdd_noActivation(%arg0: tensor<128xf32>, %arg1: tensor<1x1x3x128xf32>, %arg2: tensor<8x32x32x3xf32>) -> (tensor<*xf32>) { - // CHECK: %[[VAL_0:.*]] = "tf._FusedConv2D"(%arg2, %arg1, %arg0) {TArgs = [f32], data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> + // CHECK: %[[VAL_0:.*]] = "tf._FusedConv2D"(%arg2, %arg1, %arg0) <{data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> {TArgs = [f32]} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> // CHECK: %[[VAL_1:.*]] = "tf.Identity"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: return %[[VAL_1]] %0 = "tf.Conv2D"(%arg2, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>) -> tensor<*xf32> @@ -17,7 +17,7 @@ func.func @conv2DBiasAdd_noActivation(%arg0: tensor<128xf32>, %arg1: tensor<1x1x // CHECK-LABEL: conv2DBiasAdd_reluActivation func.func @conv2DBiasAdd_reluActivation(%arg0: tensor<128xf32>, %arg1: tensor<1x1x3x128xf32>, %arg2: tensor<8x32x32x3xf32>) -> (tensor<*xf32>) { - // CHECK: %[[VAL_0:.*]] = "tf._FusedConv2D"(%arg2, %arg1, %arg0) {TArgs = [f32], data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd", "Relu"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> + // CHECK: %[[VAL_0:.*]] = "tf._FusedConv2D"(%arg2, %arg1, %arg0) <{data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd", "Relu"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> {TArgs = [f32]} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> // CHECK: %[[VAL_1:.*]] = "tf.Identity"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: return %[[VAL_1]] %0 = "tf.Conv2D"(%arg2, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>) -> tensor<*xf32> @@ -29,7 +29,7 @@ func.func @conv2DBiasAdd_reluActivation(%arg0: tensor<128xf32>, %arg1: tensor<1x // CHECK-LABEL: conv2DBiasAdd_relu6Activation func.func @conv2DBiasAdd_relu6Activation(%arg0: tensor<128xf32>, %arg1: tensor<1x1x3x128xf32>, %arg2: tensor<8x32x32x3xf32>) -> (tensor<*xf32>) { - // CHECK: %[[VAL_0:.*]] = "tf._FusedConv2D"(%arg2, %arg1, %arg0) {TArgs = [f32], data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd", "Relu6"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> + // CHECK: %[[VAL_0:.*]] = "tf._FusedConv2D"(%arg2, %arg1, 
%arg0) <{data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd", "Relu6"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> {TArgs = [f32]} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> // CHECK: %[[VAL_1:.*]] = "tf.Identity"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: return %[[VAL_1]] %0 = "tf.Conv2D"(%arg2, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>) -> tensor<*xf32> @@ -41,7 +41,7 @@ func.func @conv2DBiasAdd_relu6Activation(%arg0: tensor<128xf32>, %arg1: tensor<1 // CHECK-LABEL: conv2DBiasAdd_eluActivation func.func @conv2DBiasAdd_eluActivation(%arg0: tensor<128xf32>, %arg1: tensor<1x1x3x128xf32>, %arg2: tensor<8x32x32x3xf32>) -> (tensor<*xf32>) { - // CHECK: %[[VAL_0:.*]] = "tf._FusedConv2D"(%arg2, %arg1, %arg0) {TArgs = [f32], data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd", "Elu"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> + // CHECK: %[[VAL_0:.*]] = "tf._FusedConv2D"(%arg2, %arg1, %arg0) <{data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd", "Elu"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> {TArgs = [f32]} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> // CHECK: %[[VAL_1:.*]] = "tf.Identity"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: return %[[VAL_1]] %0 = "tf.Conv2D"(%arg2, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>) -> tensor<*xf32> @@ -64,7 +64,7 @@ func.func @conv2DBiasAdd_convMultipleUses(%arg0: tensor<128xf32>, %arg1: tensor< // CHECK-LABEL: conv2DBiasAdd_biasAddMultipleUse func.func @conv2DBiasAdd_biasAddMultipleUse(%arg0: tensor<128xf32>, %arg1: tensor<1x1x3x128xf32>, %arg2: tensor<8x32x32x3xf32>) -> (tensor<*xf32>, tensor<*xf32>) { - // CHECK-DAG: %[[VAL:.*]] = "tf._FusedConv2D"(%arg2, %arg1, %arg0) {TArgs = [f32], data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> + // CHECK-DAG: %[[VAL:.*]] = "tf._FusedConv2D"(%arg2, %arg1, %arg0) <{data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 0.000000e+00 : f32, explicit_paddings = [], fused_ops = ["BiasAdd"], num_args = 1 : i64, operandSegmentSizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> {TArgs = [f32]} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>, tensor<128xf32>) -> tensor<*xf32> // CHECK-DAG: %[[VAL_0:.*]] = "tf.Elu"(%[[VAL]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK-DAG: %[[VAL_1:.*]] = "tf.Identity"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK-DAG: %[[VAL_2:.*]] = "tf.Identity"(%[[VAL]]) : 
(tensor<*xf32>) -> tensor<*xf32> @@ -89,7 +89,7 @@ func.func @conv2D_noFusion(%arg0: tensor<128xf32>, %arg1: tensor<1x1x3x128xf32>, // CHECK-LABEL: conv2D_noFusion1 func.func @conv2D_noFusion1(%arg0: tensor<*xf32>, %arg1: tensor<1x1x3x128xf32>, %arg2: tensor<8x32x32x3xf32>) -> (tensor<*xf32>) { // CHECK-NOT: "tf._FusedConv2D" - %0 = "tf.Conv2D"(%arg2, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>) -> tensor<*xf32> + %0 = "tf.Conv2D"(%arg2, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<8x32x32x3xf32>, tensor<1x1x3x128xf32>) -> tensor<*xf32> // The result of the conv must be the first input to BiasAdd to be fusable. %1 = "tf.BiasAdd"(%arg0, %0) {data_format = "NHWC"} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> %2 = "tf.Elu"(%1) : (tensor<*xf32>) -> tensor<*xf32> @@ -114,7 +114,7 @@ func.func @conv2D_dataFormatMismatch(%arg0: tensor<128xf32>, %arg1: tensor<1x1x3 // CHECK-LABEL: matmulBiasAdd func.func @matmulBiasAdd(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> (tensor<*xf32>) { - // CHECK: %[[VAL_3:.*]] = "tf._FusedMatMul"(%arg1, %arg2, %arg0) {epsilon = 0.000000e+00 : f32, fused_ops = ["BiasAdd"], transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>, tensor<64xf32>) -> tensor<*xf32> + // CHECK: %[[VAL_3:.*]] = "tf._FusedMatMul"(%arg1, %arg2, %arg0) <{epsilon = 0.000000e+00 : f32, fused_ops = ["BiasAdd"], transpose_a = false, transpose_b = false}> : (tensor<8x32xf32>, tensor<32x64xf32>, tensor<64xf32>) -> tensor<*xf32> // CHECK: %[[VAL_4:.*]] = "tf.Identity"(%[[VAL_3]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: return %[[VAL_4]] %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<*xf32> @@ -125,7 +125,7 @@ func.func @matmulBiasAdd(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: // CHECK-LABEL: matmulBiasAdd_relu func.func @matmulBiasAdd_relu(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> (tensor<*xf32>) { - // CHECK: %[[VAL_3:.*]] = "tf._FusedMatMul"(%arg1, %arg2, %arg0) {epsilon = 0.000000e+00 : f32, fused_ops = ["BiasAdd", "Relu"], transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>, tensor<64xf32>) -> tensor<*xf32> + // CHECK: %[[VAL_3:.*]] = "tf._FusedMatMul"(%arg1, %arg2, %arg0) <{epsilon = 0.000000e+00 : f32, fused_ops = ["BiasAdd", "Relu"], transpose_a = false, transpose_b = false}> : (tensor<8x32xf32>, tensor<32x64xf32>, tensor<64xf32>) -> tensor<*xf32> // CHECK: %[[VAL_4:.*]] = "tf.Identity"(%[[VAL_3]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: return %[[VAL_4]] %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<*xf32> @@ -137,7 +137,7 @@ func.func @matmulBiasAdd_relu(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %a // CHECK-LABEL: matmulBiasAdd_relu6 func.func @matmulBiasAdd_relu6(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> (tensor<*xf32>) { - // CHECK: %[[VAL_3:.*]] = "tf._FusedMatMul"(%arg1, %arg2, %arg0) {epsilon = 0.000000e+00 : f32, fused_ops = ["BiasAdd", "Relu6"], transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>, tensor<64xf32>) -> tensor<*xf32> + // CHECK: %[[VAL_3:.*]] = 
"tf._FusedMatMul"(%arg1, %arg2, %arg0) <{epsilon = 0.000000e+00 : f32, fused_ops = ["BiasAdd", "Relu6"], transpose_a = false, transpose_b = false}> : (tensor<8x32xf32>, tensor<32x64xf32>, tensor<64xf32>) -> tensor<*xf32> // CHECK: %[[VAL_4:.*]] = "tf.Identity"(%[[VAL_3]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: return %[[VAL_4]] %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<*xf32> @@ -149,7 +149,7 @@ func.func @matmulBiasAdd_relu6(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, % // CHECK-LABEL: matmulBiasAdd_elu func.func @matmulBiasAdd_elu(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> (tensor<*xf32>) { - // CHECK: %[[VAL_3:.*]] = "tf._FusedMatMul"(%arg1, %arg2, %arg0) {epsilon = 0.000000e+00 : f32, fused_ops = ["BiasAdd", "Elu"], transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>, tensor<64xf32>) -> tensor<*xf32> + // CHECK: %[[VAL_3:.*]] = "tf._FusedMatMul"(%arg1, %arg2, %arg0) <{epsilon = 0.000000e+00 : f32, fused_ops = ["BiasAdd", "Elu"], transpose_a = false, transpose_b = false}> : (tensor<8x32xf32>, tensor<32x64xf32>, tensor<64xf32>) -> tensor<*xf32> // CHECK: %[[VAL_4:.*]] = "tf.Identity"(%[[VAL_3]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: return %[[VAL_4]] %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<*xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt index c2f4d7aab5ca82..e820f366497473 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt @@ -54,5 +54,5 @@ versions { # the names are matching between the function definition and the uses / call # site (a numerical suffix may be appended). -# CHECK: "tf.LegacyCall"(%outputs) {_disable_call_shape_inference = false, device = "", f = @foo0} +# CHECK: "tf.LegacyCall"(%outputs) <{_disable_call_shape_inference = false, f = @foo0}> {device = ""} # CHECK: func private @foo0 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt index f954657765a56e..02dd85f3dc6a34 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt @@ -68,4 +68,4 @@ library { } # CHECK: func @main -# CHECK: "tf.LegacyCall"(%arg0) {_disable_call_shape_inference = true, _tpu_replicate = "cluster", device = "", f = @test_func_name0} +# CHECK: "tf.LegacyCall"(%arg0) <{_disable_call_shape_inference = true, f = @test_func_name0}> {_tpu_replicate = "cluster", device = ""} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt index 4b937a17af82b8..7244b60cbd0570 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt @@ -121,8 +121,8 @@ versions { # Verify that functions from the library are properly imported. 
# CHECK-LABEL: func @main() { -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, device = "", f = @foo110} -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, device = "", f = @foo111} +# CHECK: "tf.LegacyCall"() <{_disable_call_shape_inference = false, f = @foo110}> {device = ""} +# CHECK: "tf.LegacyCall"() <{_disable_call_shape_inference = false, f = @foo111}> {device = ""} # CHECK-LABEL: func private @foo110() # CHECK-LABEL: func private @foo111() diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt index 66847dc63e904f..ea5957a666f2d5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt @@ -88,7 +88,7 @@ library { # CHECK: tf_executor.graph # CHECK: "tf.VarHandleOp"() # CHECK: "tf.LegacyCall" -# CHECK-SAME: {_disable_call_shape_inference = true, device = "", f = @test_func_name0} +# CHECK-SAME: <{_disable_call_shape_inference = true, f = @test_func_name0}> {device = ""} # CHECK: tf_executor.fetch # CHECK: return # CHECK: func private @test_func_name0 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt index eb593188888371..f515761b0921ed 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt @@ -54,10 +54,10 @@ versions { # Verify that functions from the library are properly imported. # CHECK-LABEL: func @main() { -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = true, device = "", f = @foo0} -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, device = "", f = @bar0} +# CHECK: "tf.LegacyCall"() <{_disable_call_shape_inference = true, f = @foo0}> {device = ""} +# CHECK: "tf.LegacyCall"() <{_disable_call_shape_inference = false, f = @bar0}> {device = ""} # CHECK-LABEL: func private @foo0() -# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, device = "", f = @bar0} +# CHECK: "tf.LegacyCall"() <{_disable_call_shape_inference = false, f = @bar0}> {device = ""} # CHECK-LABEL: func private @bar0() diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt index fd33be7baaada2..868ee809fdd9a0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt @@ -1,7 +1,7 @@ # RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s | FileCheck %s # CHECK:"tf.MlirPassthroughOp" -# CHECK: mlir_module = "\0Afunc @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> {\0A %add = \22tf.Add\22(%arg0, %arg1) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32>\0A %ret = \22magic.op\22(%add, %add) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32>\0A return %ret : tensor<10x10xf32>\0A}\0A"} : (tensor<10xf32>, tensor<10xf32>) -> tensor<*xf32> +# CHECK: mlir_module = "\0Afunc @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> {\0A %add = \22tf.Add\22(%arg0, %arg1) : (tensor<10xf32>, tensor<10xf32>) -> 
tensor<10xf32>\0A %ret = \22magic.op\22(%add, %add) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32>\0A return %ret : tensor<10x10xf32>\0A}\0A"}> {device = ""} : (tensor<10xf32>, tensor<10xf32>) -> tensor<*xf32> node { name: "x" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/guarantee-all-funcs-one-use.mlir b/tensorflow/compiler/mlir/tensorflow/tests/guarantee-all-funcs-one-use.mlir index 52f29badb19a2b..0dd181374d5909 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/guarantee-all-funcs-one-use.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/guarantee-all-funcs-one-use.mlir @@ -73,9 +73,9 @@ module { // Test stateful and stateless partitioned calls. // CHECK-LABEL: func @f func.func @f() { - // CHECK: "tf.PartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @g} : () -> () + // CHECK: "tf.PartitionedCall"() <{config = "", config_proto = "", executor_type = "", f = @g}> : () -> () "tf.PartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @g} : () -> () - // CHECK: "tf.StatefulPartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @[[NEWG:.+]]} : () -> () + // CHECK: "tf.StatefulPartitionedCall"() <{config = "", config_proto = "", executor_type = "", f = @[[NEWG:.+]]}> : () -> () "tf.StatefulPartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @g} : () -> () func.return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/inlining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/inlining.mlir index f47330099fdc26..4b0ba861707096 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/inlining.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/inlining.mlir @@ -71,8 +71,8 @@ func.func @dont_inline_custom_on_duplicated_cases() -> tensor<2xi32> { // CHECK-LABEL: func @inline_shape_cast( // CHECK-SAME: %[[ARG:.*]]: tensor<2xi32> func.func @inline_shape_cast(%arg: tensor<2xi32>) -> tensor<2xi32> { - // CHECK-NEXT: %[[ARG_CAST:.*]] = "tf.Cast"(%[[ARG]]) {Truncate = false} : (tensor<2xi32>) -> tensor<*xi32> - // CHECK-NEXT: %[[RESULT_CAST:.*]] = "tf.Cast"(%[[ARG_CAST]]) {Truncate = false} : (tensor<*xi32>) -> tensor<2xi32> + // CHECK-NEXT: %[[ARG_CAST:.*]] = "tf.Cast"(%[[ARG]]) <{Truncate = false}> : (tensor<2xi32>) -> tensor<*xi32> + // CHECK-NEXT: %[[RESULT_CAST:.*]] = "tf.Cast"(%[[ARG_CAST]]) <{Truncate = false}> : (tensor<*xi32>) -> tensor<2xi32> // CHECK-NEXT: return %[[RESULT_CAST]] %result = "tf.PartitionedCall"(%arg) {config = "", config_proto = "", executor_type = "", f = @inline_shape_cast_callee} : (tensor<2xi32>) -> tensor<2xi32> func.return %result : tensor<2xi32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir index c2dfdc65919583..37461838f4f85e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir @@ -7,7 +7,7 @@ // CHECK-LABEL: func @transposeConv2D func.func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x7x7x8xf32> { - // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[CONV2D:[0-9]*]] = 
"tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) @@ -18,7 +18,7 @@ func.func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x // CHECK-SAME: strides = [5, 8, 6, 7] // CHECK-SAME: (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<1x8x7x7xf32> - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -38,7 +38,7 @@ func.func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x func.func @transposeConv2DWithDefaultAttr(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor { - // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) @@ -49,7 +49,7 @@ func.func @transposeConv2DWithDefaultAttr(%input: tensor<1x32x32x3xf32>, %filter // CHECK-SAME: strides = [5, 8, 6, 7] // CHECK-SAME: (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -77,7 +77,7 @@ func.func @transposeConv2DBackpropFilter( // CHECK-SAME: dst_format = "NCHW" // CHECK-SAME: src_format = "NHWC" - // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: %[[IN_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[OUT_BP_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg2, %[[ARG_PERM]]) @@ -117,7 +117,7 @@ func.func @transposeConv2DBackpropInput( // CHECK-SAME: dst_format = "NCHW" // CHECK-SAME: src_format = "NHWC" - // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: %[[OUT_BP_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg2, %[[ARG_PERM]]) // CHECK: %[[CONV2D_BACKPROP:[0-9]*]] = "tf.Conv2DBackpropInput" @@ -130,7 +130,7 @@ func.func @transposeConv2DBackpropInput( // CHECK-SAME: (tensor<4xi32>, tensor<1x1x3x8xf32>, tensor<1x8x32x32xf32>) // CHECK-SAME: -> tensor<1x3x32x32xf32> - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D_BACKPROP]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -154,7 +154,7 @@ func.func @transposeFusedBatchNormV3( ) -> tensor<1x28x28x64xf32> { // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK-SAME: <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: "tf.FusedBatchNormV3" @@ -164,7 +164,7 @@ func.func @transposeFusedBatchNormV3( // CHECK-SAME: -> (tensor<1x64x28x28xf32>, tensor<64xf32>, // CHECK: %[[RES_PERM:.*]] = "tf.Const"() - // CHECK-SAME: 
{value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK-SAME: <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%y, %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -192,7 +192,7 @@ func.func @transposeFusedBatchNormGradV3( ) -> tensor<1x28x28x64xf32> { // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK-SAME: <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: %[[ARG0_TPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[ARG1_TPOSE:[0-9]*]] = "tf.Transpose"(%arg1, %[[ARG_PERM]]) @@ -204,7 +204,7 @@ func.func @transposeFusedBatchNormGradV3( // CHECK-SAME: -> (tensor<1x64x28x28xf32>, // CHECK: %[[RES_PERM:.*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK-SAME: <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> // CHECK: %[[RES_TPOSE:[0-9]*]] = "tf.Transpose" // CHECK-SAME: (%x_backprop, %[[RES_PERM]]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir index 62749d185bfed5..b13da20bb674cd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir @@ -7,7 +7,7 @@ // CHECK-LABEL: func @transposeConv2D func.func @transposeConv2D(%input: tensor<1x3x32x32xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x8x7x6xf32> { - // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) @@ -18,7 +18,7 @@ func.func @transposeConv2D(%input: tensor<1x3x32x32xf32>, %filter: tensor<1x1x3x // CHECK-SAME: strides = [5, 7, 8, 6] // CHECK-SAME: (tensor<1x32x32x3xf32>, tensor<1x1x3x8xf32>) -> tensor<1x7x6x8xf32> - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -41,7 +41,7 @@ func.func @transposeFusedBatchNormV3( ) -> tensor<1x64x28x28xf32> { // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK-SAME: <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: "tf.FusedBatchNormV3" @@ -51,7 +51,7 @@ func.func @transposeFusedBatchNormV3( // CHECK-SAME: -> (tensor<1x28x28x64xf32>, tensor<64xf32>, // CHECK: %[[RES_PERM:.*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK-SAME: <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%y, %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -74,10 +74,10 @@ func.func @transposeFusedBatchNormV3( // CHECK-LABEL: bias_add_nchw func.func @bias_add_nchw(%arg0: tensor<1x256x150x150xf32>, %arg1: tensor<256xf32>) -> tensor<1x256x150x150xf32> { // CHECK: (%[[ARG0:.*]]: tensor<1x256x150x150xf32>, %[[ARG1:.*]]: tensor<256xf32>) - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 
1]> : tensor<4xi64>} + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> // CHECK: %[[R0:.*]] = "tf.Transpose"(%[[ARG0]], %[[CST]]) - // CHECK: %[[R1:.*]] = "tf.BiasAdd"(%[[R0]], %[[ARG1]]) {data_format = "NHWC", device = ""} - // CHECK: %[[CST_0:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[R1:.*]] = "tf.BiasAdd"(%[[R0]], %[[ARG1]]) <{data_format = "NHWC"}> {device = ""} + // CHECK: %[[CST_0:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: "tf.Transpose"(%[[R1]], %[[CST_0]]) %0 = "tf.BiasAdd"(%arg0, %arg1) {data_format = "NCHW", device = ""} : (tensor<1x256x150x150xf32>, tensor<256xf32>) -> tensor<1x256x150x150xf32> func.return %0 : tensor<1x256x150x150xf32> @@ -85,10 +85,10 @@ func.func @bias_add_nchw(%arg0: tensor<1x256x150x150xf32>, %arg1: tensor<256xf32 // CHECK-LABEL: maxpool_nchw func.func @maxpool_nchw(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> { - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> // CHECK: %[[R0:.*]] = "tf.Transpose"(%arg0, %[[CST]]) - // CHECK: %[[R1:.*]] = "tf.MaxPool"(%[[R0]]) {data_format = "NHWC", explicit_paddings = [], ksize = [1, 3, 3, 1], padding = "SAME", strides = [1, 2, 2, 1]} - // CHECK: %[[CST_0:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[R1:.*]] = "tf.MaxPool"(%[[R0]]) <{data_format = "NHWC", explicit_paddings = [], ksize = [1, 3, 3, 1], padding = "SAME", strides = [1, 2, 2, 1]}> + // CHECK: %[[CST_0:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> // CHECK: "tf.Transpose"(%[[R1]], %[[CST_0]]) %0 = "tf.MaxPool"(%arg0) { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir index be36e2e13cb7db..be511f962e26a6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir @@ -3,7 +3,7 @@ // CHECK-LABEL: func @move_across_single_op func.func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> // CHECK: return %[[TANH]] @@ -18,7 +18,7 @@ func.func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf // CHECK-LABEL: func @move_across_multiple_ops func.func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> // CHECK: %[[RELU:[0-9]*]] = "tf.Relu"(%[[TANH]]) {{.*}} tensor<1x8x4x4xf32> @@ -36,7 +36,7 @@ func.func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x // CHECK-LABEL: func @move_across_multi_operand_op func.func 
@move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> // CHECK: %[[ARG0_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[ARG1_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg1, %[[ARG_PERM]]) // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%[[ARG0_TRANSPOSE]], %[[ARG1_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> @@ -52,7 +52,7 @@ func.func @move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tenso // CHECK-LABEL: func @move_with_multiple_uses func.func @move_with_multiple_uses(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%[[TANH]], %[[TANH]]) {{.*}} tensor<1x8x4x4xf32> @@ -78,9 +78,9 @@ func.func @move_transpose_handle_broadcast(%arg0:tensor<8x64xf32>, %arg1:tensor< func.return %3 : tensor<512x64xf32> - // CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor - // CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() {value = dense<[512, 64]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<[2, 0, 1]> : tensor<3xi32>}> : () -> tensor<3xi32> + // CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor + // CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() <{value = dense<[512, 64]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[EXPAND_DIMS:.*]] = "tf.ExpandDims"(%arg0, %[[CST_1]]) {device = ""} : (tensor<8x64xf32>, tensor) -> tensor<8x64x1xf32> // CHECK: %[[TRANSPOSE_1:.*]] = "tf.Transpose"(%[[EXPAND_DIMS]], %[[CST_0]]) : (tensor<8x64x1xf32>, tensor<3xi32>) -> tensor<1x8x64xf32> // CHECK: %[[TRANSPOSE_2:.*]] = "tf.Transpose"(%arg1, %[[CST_0]]) : (tensor<8x64x64xf32>, tensor<3xi32>) -> tensor<64x8x64xf32> @@ -97,7 +97,7 @@ func.func @dont_move_transpose_different_ranks(%arg0:tensor<1x1x2x3xf32>, %arg1: func.return %1 : tensor<1x2x1x3xf32> - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%arg0, %arg1) {device = ""} : (tensor<1x1x2x3xf32>, tensor<2x3xf32>) -> tensor<1x1x2x3xf32> // CHECK: %[[TRANSPOSE:.*]] = "tf.Transpose"(%[[ADD]], %[[CST]]) {device = ""} : (tensor<1x1x2x3xf32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> // CHECK: return %[[TRANSPOSE]] : tensor<1x2x1x3xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir index 20bf6d65b9ab1d..0bc9a131cfab09 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir @@ -4,7 +4,7 @@ // CHECK-LABEL: func @move_across_single_op func.func 
@move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%arg0) {{.*}} tensor<1x4x4x8xf32> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[TANH]], %[[RES_PERM]]) {{.*}} tensor<1x8x4x4xf32> // CHECK: return %[[RES_TRANSPOSE]] @@ -19,7 +19,7 @@ func.func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf // CHECK-LABEL: func @move_across_multiple_ops func.func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%arg0) {{.*}} tensor<1x4x4x8xf32> // CHECK: %[[RELU:[0-9]*]] = "tf.Relu"(%[[TANH]]) {{.*}} tensor<1x4x4x8xf32> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[RELU]], %[[RES_PERM]]) @@ -36,7 +36,7 @@ func.func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x // CHECK-LABEL: func @move_across_multi_operand_op func.func @move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg0, %arg1) {{.*}} tensor<1x4x4x8xf32> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[ADD]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -52,7 +52,7 @@ func.func @move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tenso // CHECK-LABEL: func @move_across_broadcastable_op func.func @move_across_broadcastable_op(%arg0: tensor<1x4x1x8xf32>, %arg1: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1x4x1x8xf32>, tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[ADD]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -68,7 +68,7 @@ func.func @move_across_broadcastable_op(%arg0: tensor<1x4x1x8xf32>, %arg1: tenso // CHECK-LABEL: func @move_across_double_transpose func.func @move_across_double_transpose(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<1x4x4x8xf32>) -> tensor<1x4x8x4xf32> { - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1x4x4x8xf32>, tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> // CHECK: %[[RES_TRANSPOSE_0:[0-9]*]] = "tf.Transpose"(%[[ADD]], %[[RES_PERM]]) // CHECK: %[[RES_TRANSPOSE_1:[0-9]*]] = "tf.Transpose"(%[[RES_TRANSPOSE_0]], %[[RES_PERM]]) @@ -90,8 +90,8 @@ func.func @fold_into_max_pool(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x56x56 // MaxPool operand transpose must be folded into the op and MaxPool // must use NCHW data format with updated kernel size and strides. 
- // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} - // CHECK: %[[MAX_POOL:[0-9]*]] = "tf.MaxPool"(%arg0) {data_format = "NCHW", ksize = [1, 1, 3, 3], padding = "SAME", strides = [1, 1, 2, 2]} : (tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> + // CHECK: %[[MAX_POOL:[0-9]*]] = "tf.MaxPool"(%arg0) <{data_format = "NCHW", ksize = [1, 1, 3, 3], padding = "SAME", strides = [1, 1, 2, 2]}> : (tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[MAX_POOL]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -112,14 +112,14 @@ func.func @fold_into_max_pool(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x56x56 // CHECK-LABEL: func @fold_into_mean func.func @fold_into_mean(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64xf32> { - // CHECK: %[[RED_IDX:.*]] = "tf.Const"() {value = dense<[2, 3]> : tensor<2xi32>} + // CHECK: %[[RED_IDX:.*]] = "tf.Const"() <{value = dense<[2, 3]> : tensor<2xi32>}> // CHECK: %[[MEAN:[0-9]*]] = "tf.Mean"(%arg0, %[[RED_IDX]]) // CHECK-SAME: (tensor<1x64x112x112xf32>, tensor<2xi32>) -> tensor<1x64xf32> // CHECK: return %[[MEAN]] - // NOFOLD: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // NOFOLD: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> // NOFOLD: %[[TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[CST]]) - // NOFOLD: %[[CST_1:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} + // NOFOLD: %[[CST_1:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> // NOFOLD: %[[MEAN:[0-9]*]] = "tf.Mean"(%[[TRANSPOSE]], %[[CST_1]]) // NOFOLD-SAME: (tensor<1x112x112x64xf32>, tensor<2xi32>) -> tensor<1x64xf32> // NOFOLD: return %[[MEAN]] @@ -138,8 +138,8 @@ func.func @fold_into_mean(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64xf32> { // CHECK-LABEL: func @fold_into_fused_batch_norm func.func @fold_into_fused_batch_norm(%arg0: tensor<1x64x112x112xf32>, %arg1: tensor<64xf32>) -> tensor<1x112x112x64xf32> { - // CHECK: %[[RES_PERM:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} - // CHECK: "tf.FusedBatchNormV3"(%arg0, {{.*}} {data_format = "NCHW" + // CHECK: %[[RES_PERM:.*]] = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> + // CHECK: "tf.FusedBatchNormV3"(%arg0, {{.*}} <{data_format = "NCHW" // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%y, %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -165,9 +165,9 @@ func.func @fold_into_fused_batch_norm(%arg0: tensor<1x64x112x112xf32>, %arg1: te // CHECK-LABEL: func @fold_into_pad_with_extra_uses func.func @fold_into_pad_with_extra_uses(%arg0: tensor<1x2x4x4x3xf32>) -> (tensor<1x2x3x4x4xf32>, tensor<1x2x3x6x6xf32>) { - // CHECK: %[[PERM:.*]] = "tf.Const"() {value = dense<[0, 1, 4, 2, 3]> : tensor<5xi32>} + // CHECK: %[[PERM:.*]] = "tf.Const"() <{value = dense<[0, 1, 4, 2, 3]> : tensor<5xi32>}> // CHECK: %[[TRANSPOSE_OP:[0-9]*]] = "tf.Transpose"(%arg0, %[[PERM]]) - // CHECK: %[[PADDING:.*]] = "tf.Const"() {value = dense<{{\[\[}}0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<5x2xi32>} + // CHECK: %[[PADDING:.*]] = "tf.Const"() <{value = dense<{{\[\[}}0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<5x2xi32>}> // CHECK: %[[PAD_OP:[0-9]*]] = "tf.Pad"(%arg0, %[[PADDING]]) // CHECK: %[[DUP_TRANSPOSE_OP:[0-9]*]] = "tf.Transpose"(%[[PAD_OP]], %[[PERM]]) // CHECK: return %[[TRANSPOSE_OP]], %[[DUP_TRANSPOSE_OP]] diff --git 
a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nhwc.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nhwc.mlir index 5f9256f6424ab1..e82819686948d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nhwc.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nhwc.mlir @@ -32,7 +32,7 @@ func.func @transpose_resnet_layer(%arg0: tensor, // input // Shuffled paddings. // CHECK: %[[PADDINGS:.*]] = "tf.Const"(){{.*}}[0, 0], [3, 3], [3, 3], [0, 0] // NOFOLD: %[[PADDING:.*]] = "tf.Const"(){{.*}}[0, 0], [0, 0], [3, 3], [3, 3] - // NOFOLD: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + // NOFOLD: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> // NOFOLD: %[[TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[CST]]) : (tensor, tensor<4xi32>) -> tensor // Pad input with new paddings. @@ -151,7 +151,7 @@ func.func @transpose_resnet_layer(%arg0: tensor, // input %16 = "tf.Mean"(%15, %1) : (tensor, tensor<2xi32>) -> tensor // Mean should compute reduction over NHWC spatial dimensions. - // CHECK: %[[MEAN_DIMS:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} + // CHECK: %[[MEAN_DIMS:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> // CHECK: %[[MEAN:[0-9]*]] = "tf.Mean"(%[[RELU]], %[[MEAN_DIMS]]) // CHECK-SAME: (tensor, tensor<2xi32>) -> tensor // CHECK: return %[[MEAN]] : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_tfg.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_tfg.mlir index c170a0e41c426f..0ff3cd334fc2aa 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_tfg.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_tfg.mlir @@ -4,9 +4,9 @@ module { // CHECK: tf_executor.graph tfg.graph #tf_type.version { - // CHECK: tf_executor.island wraps "tf.VarHandleOp"() {_mlir_name = "x", _output_shapes = [#tf_type.shape<>], allowed_devices = [], container = "a", device = "/device:CPU:0", dtype = i64, shape = #tf_type.shape<>, shared_name = "x"} : () -> tensor>> + // CHECK: tf_executor.island wraps "tf.VarHandleOp"() <{container = "a", shared_name = "x"}> {_mlir_name = "x", _output_shapes = [#tf_type.shape<>], allowed_devices = [], device = "/device:CPU:0", dtype = i64, shape = #tf_type.shape<>} : () -> tensor>> %VarHandleOp, %ctl = VarHandleOp device("/CPU:0") name("x") {_output_shapes = [#tf_type.shape<>], allowed_devices = [], container = "a", dtype = i64, shape = #tf_type.shape<>, shared_name = "x"} : () -> (tensor>>) - // CHECK: tf_executor.island wraps "tf.LegacyCall"(%outputs, %outputs) {_disable_call_shape_inference = true, f = @test_func_name0} : (tensor>>, tensor>>) -> tensor<*x!tf_type.resource> + // CHECK: tf_executor.island wraps "tf.LegacyCall"(%outputs, %outputs) <{_disable_call_shape_inference = true, f = @test_func_name0}> : (tensor>>, tensor>>) -> tensor<*x!tf_type.resource> %test_func_name0, %ctl_0 = test_func_name0(%VarHandleOp, %VarHandleOp) name("called") {_disable_call_shape_inference = true, _output_shapes = [#tf_type.shape<*>]} : (tensor>>, tensor>>) -> (tensor<*x!tf_type.resource>) // CHECK: tf_executor.island wraps "tf._Retval"(%outputs_0) {T = !tf_type.resource, _mlir_name = "func_call", index = 0 : i64} : (tensor<*x!tf_type.resource>) -> () %ctl_1 = _Retval(%test_func_name0) name("func_call") {T = !tf_type.resource, index = 0 : i64} : tensor<*x!tf_type.resource> diff --git 
a/tensorflow/compiler/mlir/tensorflow/tests/lower_quantized.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_quantized.mlir index 11c9704ee0074f..eedc235b10f16e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/lower_quantized.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_quantized.mlir @@ -2,9 +2,9 @@ // CHECK-LABEL: dequantize func.func @dequantize(%arg0: tensor<2x3x!tf_type.qint8>, %min_range: tensor, %max_range: tensor) -> tensor<2x3xf32> { - // CHECK-DAG: %[[HALF_RANGE:.*]] = "tf.Const"() {value = dense<1.280000e+02> : tensor} - // CHECK-DAG: %[[C255:.*]] = "tf.Const"() {value = dense<2.550000e+02> : tensor} - // CHECK-DAG: %[[CAST:.*]] = "tf.Cast"(%arg0) {Truncate = false} + // CHECK-DAG: %[[HALF_RANGE:.*]] = "tf.Const"() <{value = dense<1.280000e+02> : tensor}> + // CHECK-DAG: %[[C255:.*]] = "tf.Const"() <{value = dense<2.550000e+02> : tensor}> + // CHECK-DAG: %[[CAST:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> // CHECK-DAG: %[[SHIFT:.*]] = "tf.AddV2"(%[[CAST]], %[[HALF_RANGE]]) // CHECK-DAG: %[[DRANGE:.*]] = "tf.Sub"(%arg2, %arg1) // CHECK-DAG: %[[SCALE:.*]] = "tf.Div"(%[[DRANGE]], %[[C255:.*]]) @@ -18,8 +18,8 @@ func.func @dequantize(%arg0: tensor<2x3x!tf_type.qint8>, %min_range: tensor // CHECK-LABEL: dequantize_quint8 func.func @dequantize_quint8(%arg0: tensor<2x3x!tf_type.quint8>, %min_range: tensor, %max_range: tensor) -> tensor<2x3xf32> { - // CHECK-NEXT: %[[C255:.*]] = "tf.Const"() {value = dense<2.550000e+02> : tensor} - // CHECK-NEXT: %[[CAST:.*]] = "tf.Cast"(%arg0) {Truncate = false} + // CHECK-NEXT: %[[C255:.*]] = "tf.Const"() <{value = dense<2.550000e+02> : tensor}> + // CHECK-NEXT: %[[CAST:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> // CHECK-NEXT: %[[DRANGE:.*]] = "tf.Sub"(%arg2, %arg1) // CHECK-NEXT: %[[SCALE:.*]] = "tf.Div"(%[[DRANGE]], %[[C255:.*]]) // CHECK-NEXT: %[[SS:.*]] = "tf.Mul"(%[[CAST]], %[[SCALE]]) @@ -32,15 +32,15 @@ func.func @dequantize_quint8(%arg0: tensor<2x3x!tf_type.quint8>, %min_range: ten // CHECK-LABEL: dequantize_to_bf16 func.func @dequantize_to_bf16(%arg0: tensor<2x3x!tf_type.qint8>, %min_range: tensor, %max_range: tensor) -> tensor<2x3xbf16> { - // CHECK-DAG: %[[HALF_RANGE:.*]] = "tf.Const"() {value = dense<1.280000e+02> : tensor} - // CHECK-DAG: %[[C255:.*]] = "tf.Const"() {value = dense<2.550000e+02> : tensor} - // CHECK-DAG: %[[CAST:.*]] = "tf.Cast"(%arg0) {Truncate = false} + // CHECK-DAG: %[[HALF_RANGE:.*]] = "tf.Const"() <{value = dense<1.280000e+02> : tensor}> + // CHECK-DAG: %[[C255:.*]] = "tf.Const"() <{value = dense<2.550000e+02> : tensor}> + // CHECK-DAG: %[[CAST:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> // CHECK-DAG: %[[SHIFT:.*]] = "tf.AddV2"(%[[CAST]], %[[HALF_RANGE]]) // CHECK-DAG: %[[DRANGE:.*]] = "tf.Sub"(%arg2, %arg1) // CHECK-DAG: %[[SCALE:.*]] = "tf.Div"(%[[DRANGE]], %[[C255:.*]]) // CHECK-DAG: %[[SS:.*]] = "tf.Mul"(%[[SHIFT]], %[[SCALE]]) // CHECK-DAG: %[[F32_RESULT:.*]] = "tf.AddV2"(%[[SS]], %arg1) - // CHECK-DAG: %[[RESULT:.*]] = "tf.Cast"(%[[F32_RESULT]]) {Truncate = false} + // CHECK-DAG: %[[RESULT:.*]] = "tf.Cast"(%[[F32_RESULT]]) <{Truncate = false}> %0 = "tf.Dequantize"(%arg0, %min_range, %max_range) : (tensor<2x3x!tf_type.qint8>, tensor, tensor) -> tensor<2x3xbf16> // CHECK-DAG: return %[[RESULT]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir index 432195aed3b4d3..83f0b56e398c17 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir 
@@ -2,11 +2,11 @@ // CHECK-LABEL: invert_permutation func.func @invert_permutation(%arg0: tensor<5xi32>) -> tensor<5xi32> { - // CHECK-DAG: %[[UPDATES:.*]] = "tf.Const"() {value = dense<[0, 1, 2, 3, 4]> : tensor<5xi32>} : () -> tensor<5xi32> - // CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[5, 1]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // CHECK-DAG: %[[cst_2:.*]] = "tf.Const"() {value = dense<1> : tensor<5xi32>} : () -> tensor<5xi32> - // CHECK-DAG: %[[cst_3:.*]] = "tf.Const"() {value = dense<0> : tensor<5xi32>} : () -> tensor<5xi32> + // CHECK-DAG: %[[UPDATES:.*]] = "tf.Const"() <{value = dense<[0, 1, 2, 3, 4]> : tensor<5xi32>}> : () -> tensor<5xi32> + // CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 1]> : tensor<2xi32>}> : () -> tensor<2xi32> + // CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor + // CHECK-DAG: %[[cst_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<5xi32>}> : () -> tensor<5xi32> + // CHECK-DAG: %[[cst_3:.*]] = "tf.Const"() <{value = dense<0> : tensor<5xi32>}> : () -> tensor<5xi32> // CHECK-DAG: %[[INDICES:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<5xi32>, tensor<2xi32>) -> tensor<5x1xi32> // CHECK-DAG: %[[INDICES_1:.*]] = "tf.TensorScatterAdd"(%[[cst_3]], %[[INDICES]], %[[cst_2]]) : (tensor<5xi32>, tensor<5x1xi32>, tensor<5xi32>) -> tensor<5xi32> @@ -35,7 +35,7 @@ func.func @invert_permutation_unranked(%arg0: tensor<*xi32>) -> tensor<*xi32> { // CHECK-LABEL: simple_pack // CHECK-SAME: %[[ARG0:.*]]: tensor<3x5xf32>, %[[ARG1:.*]]: tensor<3x5xf32> func.func @simple_pack(%arg0: tensor<3x5xf32>, %arg1: tensor<3x5xf32>) -> tensor<2x3x5xf32> { - // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> // CHECK: %[[INP0:.*]] = "tf.ExpandDims"(%[[ARG0]], %[[AXIS]]) : (tensor<3x5xf32>, tensor) -> tensor<1x3x5xf32> // CHECK: %[[INP1:.*]] = "tf.ExpandDims"(%[[ARG1]], %[[AXIS]]) : (tensor<3x5xf32>, tensor) -> tensor<1x3x5xf32> // CHECK: "tf.ConcatV2"(%[[INP0]], %[[INP1]], %[[AXIS]]) : (tensor<1x3x5xf32>, tensor<1x3x5xf32>, tensor) -> tensor<2x3x5xf32> @@ -71,8 +71,8 @@ func.func @squared_difference_complex(%arg0: tensor<3xcomplex>, %arg1: tens // CHECK-LABEL: func @div_no_nan // CHECK-SAME: (%[[X:.*]]: tensor<*xf32>, %[[Y:.*]]: tensor<*xf32>) func.func @div_no_nan(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[Y]], %[[ZERO]]) {incompatible_shape_error = true} : (tensor<*xf32>, tensor) -> tensor<*xi1> + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[Y]], %[[ZERO]]) <{incompatible_shape_error = true}> : (tensor<*xf32>, tensor) -> tensor<*xi1> // CHECK: %[[DIV:.*]] = "tf.Div"(%[[X]], %[[Y]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[RESULT:.*]] = "tf.SelectV2"(%[[IS_ZERO]], %[[ZERO]], %[[DIV]]) : (tensor<*xi1>, tensor, tensor<*xf32>) -> tensor<*xf32> %0 = "tf.DivNoNan"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> @@ -96,7 +96,7 @@ func.func @truncate_div_int(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) // CHECK-SAME: (%[[LHS:.*]]: tensor<*xf32>, %[[RHS:.*]]: tensor<*xf32>) func.func @truncate_div_float(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { - // 
CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor // CHECK: %[[XDIVY:.*]] = "tf.Div"(%[[LHS]], %[[RHS]]) // CHECK: %[[MASK:.*]] = "tf.Less"(%[[XDIVY]], %[[ZERO]]) // CHECK: %[[CEIL:.*]] = "tf.Ceil"(%[[XDIVY]]) @@ -112,8 +112,8 @@ func.func @truncate_div_float(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) // CHECK-LABEL: func @mul_no_nan // CHECK-SAME: (%[[X:.*]]: tensor<2x3xf32>, %[[Y:.*]]: tensor<3xf32>) func.func @mul_no_nan(%arg0: tensor<2x3xf32>, %arg1: tensor<3xf32>) -> tensor<2x3xf32> { - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[Y]], %[[ZERO]]) {incompatible_shape_error = true} : (tensor<3xf32>, tensor) -> tensor<3xi1> + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[Y]], %[[ZERO]]) <{incompatible_shape_error = true}> : (tensor<3xf32>, tensor) -> tensor<3xi1> // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[X]], %[[Y]]) : (tensor<2x3xf32>, tensor<3xf32>) -> tensor<2x3xf32> // CHECK: %[[RESULT:.*]] = "tf.SelectV2"(%[[IS_ZERO]], %[[ZERO]], %[[MUL]]) : (tensor<3xi1>, tensor, tensor<2x3xf32>) -> tensor<2x3xf32> %0 = "tf.MulNoNan"(%arg0, %arg1) : (tensor<2x3xf32>, tensor<3xf32>) -> tensor<2x3xf32> @@ -124,9 +124,9 @@ func.func @mul_no_nan(%arg0: tensor<2x3xf32>, %arg1: tensor<3xf32>) -> tensor<2x // CHECK-LABEL: @is_inf func.func @is_inf(%arg0: tensor<3x4xf32>) -> tensor<3x4xi1> { - // CHECK: %[[INF:.*]] = "tf.Const"() {value = dense<0x7F800000> : tensor} : () -> tensor + // CHECK: %[[INF:.*]] = "tf.Const"() <{value = dense<0x7F800000> : tensor}> : () -> tensor // CHECK: %[[ABS:.*]] = "tf.Abs"(%arg0) : (tensor<3x4xf32>) -> tensor<3x4xf32> - // CHECK: %[[RESULT:.*]] = "tf.Equal"(%[[ABS]], %[[INF]]) {incompatible_shape_error = true} : (tensor<3x4xf32>, tensor) -> tensor<3x4xi1> + // CHECK: %[[RESULT:.*]] = "tf.Equal"(%[[ABS]], %[[INF]]) <{incompatible_shape_error = true}> : (tensor<3x4xf32>, tensor) -> tensor<3x4xi1> %0 = "tf.IsInf"(%arg0) : (tensor<3x4xf32>) -> tensor<3x4xi1> // CHECK: return %[[RESULT]] func.return %0 : tensor<3x4xi1> @@ -134,7 +134,7 @@ func.func @is_inf(%arg0: tensor<3x4xf32>) -> tensor<3x4xi1> { // CHECK-LABEL: @is_nan func.func @is_nan(%arg0: tensor<3x4xf32>) -> tensor<3x4xi1> { - // CHECK: %[[RESULT:.*]] = "tf.NotEqual"(%arg0, %arg0) {incompatible_shape_error = true} : (tensor<3x4xf32>, tensor<3x4xf32>) -> tensor<3x4xi1> + // CHECK: %[[RESULT:.*]] = "tf.NotEqual"(%arg0, %arg0) <{incompatible_shape_error = true}> : (tensor<3x4xf32>, tensor<3x4xf32>) -> tensor<3x4xi1> %0 = "tf.IsNan"(%arg0) : (tensor<3x4xf32>) -> tensor<3x4xi1> // CHECK: return %[[RESULT]] func.return %0 : tensor<3x4xi1> @@ -149,7 +149,7 @@ func.func @fill(%arg0: tensor<*xi64>, %arg1: tensor<*xf32>) -> tensor<*xf32> { } func.func @empty(%arg0: tensor) -> tensor<*xf32> { - // CHECK-DAG: [[CST:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} + // CHECK-DAG: [[CST:%.+]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> // CHECK-DAG: [[RES:%.+]] = "tf.BroadcastTo"([[CST]], %arg0) %0 = "tf.Empty"(%arg0) {init = true} : (tensor) -> (tensor<*xf32>) @@ -162,9 +162,9 @@ func.func @empty(%arg0: tensor) -> tensor<*xf32> { func.func @l2_loss(%arg0: tensor) -> tensor { // CHECK-DAG: %[[SQUARE:.*]] = "tf.Mul"(%[[INPUT]], %[[INPUT]]) : (tensor, tensor) -> tensor - // CHECK-DAG: %[[REDUCE_AXES:.*]] = 
"tf.Const"() {value = dense<[0, 1]> : tensor<2xi64>} - // CHECK-DAG: %[[SUM:.*]] = "tf.Sum"(%[[SQUARE]], %[[REDUCE_AXES]]) {keep_dims = false} : (tensor, tensor<2xi64>) -> tensor - // CHECK-DAG: %[[TWO:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor} + // CHECK-DAG: %[[REDUCE_AXES:.*]] = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> + // CHECK-DAG: %[[SUM:.*]] = "tf.Sum"(%[[SQUARE]], %[[REDUCE_AXES]]) <{keep_dims = false}> : (tensor, tensor<2xi64>) -> tensor + // CHECK-DAG: %[[TWO:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor}> // CHECK-DAG: %[[LOSS:.*]] = "tf.Div"(%[[SUM]], %[[TWO]]) : (tensor, tensor) -> tensor %0 = "tf.L2Loss"(%arg0) : (tensor) -> tensor @@ -183,7 +183,7 @@ func.func @l2_loss_unranked(%arg0: tensor<*xf32>) -> tensor { // CHECK-LABEL: pack_with_unranked // CHECK-SAME: %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor<*xf32> func.func @pack_with_unranked(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<-2> : tensor} + // CHECK: %[[AXIS:.*]] = "tf.Const"() <{value = dense<-2> : tensor}> // CHECK: %[[INP0:.*]] = "tf.ExpandDims"(%[[ARG0]], %[[AXIS]]) : (tensor, tensor) -> tensor // CHECK: %[[INP1:.*]] = "tf.ExpandDims"(%[[ARG1]], %[[AXIS]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> // CHECK: "tf.ConcatV2"(%[[INP0]], %[[INP1]], %[[AXIS]]) : (tensor, tensor<*xf32>, tensor) -> tensor<*xf32> @@ -196,7 +196,7 @@ func.func @pack_with_unranked(%arg0: tensor, %arg1: tensor<*xf32>) -> t func.func @pad(%arg0: tensor<3xf32>) -> tensor<6xf32> { %padding = "tf.Const"() { value = dense<[[1, 2]]> : tensor<1x2xi64> } : () -> tensor<1x2xi64> // CHECK-DAG: [[PAD:%.+]] = "tf.Const"() {{.+}} -> tensor<1x2xi64> - // CHECK-DAG: [[CST:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} + // CHECK-DAG: [[CST:%.+]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> // CHECK: "tf.PadV2"(%arg0, [[PAD]], [[CST]]) %0 = "tf.Pad"(%arg0, %padding) : (tensor<3xf32>, tensor<1x2xi64>) -> tensor<6xf32> func.return %0 : tensor<6xf32> @@ -206,7 +206,7 @@ func.func @pad(%arg0: tensor<3xf32>) -> tensor<6xf32> { func.func @pad_bf16(%arg0: tensor<3xbf16>) -> tensor<6xbf16> { %padding = "tf.Const"() { value = dense<[[1, 2]]> : tensor<1x2xi64> } : () -> tensor<1x2xi64> // CHECK-DAG: [[PAD:%.+]] = "tf.Const"() {{.+}} -> tensor<1x2xi64> - // CHECK-DAG: [[CST:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} + // CHECK-DAG: [[CST:%.+]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> // CHECK: "tf.PadV2"(%arg0, [[PAD]], [[CST]]) %0 = "tf.Pad"(%arg0, %padding) : (tensor<3xbf16>, tensor<1x2xi64>) -> tensor<6xbf16> func.return %0 : tensor<6xbf16> @@ -221,8 +221,8 @@ func.func @add_f32(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xf32> // CHECK-LABEL: func @BiasAddGrad_NHWC func.func @BiasAddGrad_NHWC(%arg0: tensor<2x3x4x5xf32>) -> tensor<5xf32> { - // CHECK: "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} - // CHECK: "tf.Sum"({{.*}}) {keep_dims = false} + // CHECK: "tf.Const"() <{value = dense<[0, 1, 2]> : tensor<3xi64>}> + // CHECK: "tf.Sum"({{.*}}) <{keep_dims = false}> %0 = "tf.BiasAddGrad"(%arg0) {data_format = "NHWC"} : (tensor<2x3x4x5xf32>) -> tensor<5xf32> func.return %0 : tensor<5xf32> @@ -230,8 +230,8 @@ func.func @BiasAddGrad_NHWC(%arg0: tensor<2x3x4x5xf32>) -> tensor<5xf32> { // CHECK-LABEL: func @BiasAddGrad_NCHW func.func @BiasAddGrad_NCHW(%arg0: tensor<2x3x4x5xf32>) -> tensor<3xf32> { - // CHECK: "tf.Const"() {value = dense<[0, 2, 3]> : tensor<3xi64>} - // CHECK: 
"tf.Sum"({{.*}}) {keep_dims = false} + // CHECK: "tf.Const"() <{value = dense<[0, 2, 3]> : tensor<3xi64>}> + // CHECK: "tf.Sum"({{.*}}) <{keep_dims = false}> %0 = "tf.BiasAddGrad"(%arg0) {data_format = "NCHW"} : (tensor<2x3x4x5xf32>) -> tensor<3xf32> func.return %0 : tensor<3xf32> @@ -254,7 +254,7 @@ func.func @BiasAddGrad_unranked(%arg0: tensor<*xf32>) -> tensor { // CHECK-LABEL: func @rsqrt_grad // CHECK-SAME: (%[[ARG0:.*]]: tensor<2xf32>, %[[ARG1:.*]]: tensor<2xf32>) func.func @rsqrt_grad(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<-2.000000e+00> : tensor} + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<-2.000000e+00> : tensor}> // CHECK: %[[LHS2:.*]] = "tf.Mul"(%[[ARG0]], %[[ARG0]]) // CHECK: %[[LHS3:.*]] = "tf.Mul"(%[[LHS2]], %[[ARG0]]) // CHECK: %[[DIV:.*]] = "tf.Div"(%[[ARG1]], %[[CST]]) @@ -279,7 +279,7 @@ func.func @rsqrt_grad_unranked(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> te // CHECK-LABEL: func @sqrt_grad_unranked // CHECK-SAME: (%[[ARG0:.*]]: tensor<*xcomplex>, %[[ARG1:.*]]: tensor<*xcomplex>) func.func @sqrt_grad_unranked(%arg0: tensor<*xcomplex>, %arg1: tensor<*xcomplex>) -> tensor<*xcomplex> { - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<(5.000000e-01,0.000000e+00)> : tensor>} : () -> tensor> + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<(5.000000e-01,0.000000e+00)> : tensor>}> : () -> tensor> // CHECK: %[[MUL:.*]] = "tf.Mul"(%arg1, %[[CST]]) : (tensor<*xcomplex>, tensor>) -> tensor<*xcomplex> // CHECK: %[[RET:.*]] = "tf.Div"(%[[MUL]], %arg0) : (tensor<*xcomplex>, tensor<*xcomplex>) -> tensor<*xcomplex> @@ -292,22 +292,22 @@ func.func @sqrt_grad_unranked(%arg0: tensor<*xcomplex>, %arg1: tensor<*xcom // dimension. // CHECK-LABEL: fourdim_space_to_batch_nd func.func @fourdim_space_to_batch_nd(%input: tensor<3x5x7x10xf32>, %block_shape: tensor<2xi64>, %paddings: tensor<2x2xi64>) -> tensor { - // CHECK-DAG: [[PAD00:%.+]] = "tf.Const"() {value = dense<0> : tensor<1x2xi64>} - // CHECK-DAG: [[ZERO_I32:%.+]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: [[ZERO_I64:%.+]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK-DAG: [[PAD00:%.+]] = "tf.Const"() <{value = dense<0> : tensor<1x2xi64>}> + // CHECK-DAG: [[ZERO_I32:%.+]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: [[ZERO_I64:%.+]] = "tf.Const"() <{value = dense<0> : tensor}> // CHECK-DAG: [[FULL_PADDINGS:%.+]] = "tf.ConcatV2"([[PAD00]], %arg2, [[PAD00]], [[ZERO_I64]]) - // CHECK-DAG: [[PAD_DEFAULT:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} + // CHECK-DAG: [[PAD_DEFAULT:%.+]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> // CHECK-DAG: [[PADDED:%.+]] = "tf.PadV2"(%arg0, [[FULL_PADDINGS]], [[PAD_DEFAULT]]) - // CHECK-DAG: [[PADDINGS:%.+]]:2 = "tf.Unpack"([[FULL_PADDINGS]]) {axis = 1 : i64} + // CHECK-DAG: [[PADDINGS:%.+]]:2 = "tf.Unpack"([[FULL_PADDINGS]]) <{axis = 1 : i64}> // CHECK-DAG: [[PADDINGS_SUM:%.+]] = "tf.AddV2"([[PADDINGS]]#0, [[PADDINGS]]#1) - // CHECK-DAG: [[INPUT_SHAPE:%.+]] = "tf.Const"() {value = dense<[3, 5, 7, 10]> : tensor<4xi64>} + // CHECK-DAG: [[INPUT_SHAPE:%.+]] = "tf.Const"() <{value = dense<[3, 5, 7, 10]> : tensor<4xi64>}> // CHECK-DAG: [[PADDED_SHAPE:%.+]] = "tf.AddV2"([[PADDINGS_SUM]], [[INPUT_SHAPE]]) // CHECK-DAG: [[PADDED_SHAPE_SPLITS:%.+]]:4 = "tf.Split"([[ZERO_I32]], [[PADDED_SHAPE]]) // CHECK-DAG: [[BLOCK_SHAPE_SPLITS:%.+]]:2 = "tf.Split"([[ZERO_I32]], %arg1) // CHECK-DAG: [[OUTER_SHAPE_0:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#1, 
[[BLOCK_SHAPE_SPLITS]]#0) // CHECK-DAG: [[OUTER_SHAPE_1:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#2, [[BLOCK_SHAPE_SPLITS]]#1) // CHECK-DAG: [[RESHAPED_SHAPE:%.+]] = "tf.ConcatV2"([[PADDED_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_0]], [[BLOCK_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_1]], [[BLOCK_SHAPE_SPLITS]]#1, [[PADDED_SHAPE_SPLITS]]#3, [[ZERO_I64]]) - // CHECK-DAG: [[PERMUTATION:%.+]] = "tf.Const"() {value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi64>} + // CHECK-DAG: [[PERMUTATION:%.+]] = "tf.Const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi64>}> // CHECK-DAG: [[OUTPUT_BATCH_PART:%.+]] = "tf.Mul"([[PADDED_SHAPE_SPLITS]]#0, [[BLOCK_SHAPE_SPLITS]]#0) // CHECK-DAG: [[OUTPUT_BATCH:%.+]] = "tf.Mul"([[OUTPUT_BATCH_PART]], [[BLOCK_SHAPE_SPLITS]]#1) // CHECK-DAG: [[OUTPUT_SHAPE:%.+]] = "tf.ConcatV2"([[OUTPUT_BATCH]], [[OUTER_SHAPE_0]], [[OUTER_SHAPE_1]], [[PADDED_SHAPE_SPLITS]]#3, [[ZERO_I64]]) @@ -336,11 +336,11 @@ func.func @const_paddings_space_to_batch_nd(%arg0: tensor<1x8x2xf32>) -> (tensor %1 = "tf.Const"() {value = dense<[[3, 4]]> : tensor<1x2xi32>} : () -> tensor<1x2xi32> - // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() {value = dense<[3, 5, 2]> : tensor<3xi64>} - // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() {value = dense<[1, 5, 3, 2]> : tensor<4xi64>} - // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() {value = dense<{{\[\[}}0, 0], [3, 4], [0, 0{{\]\]}}> : tensor<3x2xi64>} - // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} - // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() {value = dense<[2, 0, 1, 3]> : tensor<4xi64>} + // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() <{value = dense<[3, 5, 2]> : tensor<3xi64>}> + // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() <{value = dense<[1, 5, 3, 2]> : tensor<4xi64>}> + // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() <{value = dense<{{\[\[}}0, 0], [3, 4], [0, 0{{\]\]}}> : tensor<3x2xi64>}> + // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> + // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() <{value = dense<[2, 0, 1, 3]> : tensor<4xi64>}> // CHECK-DAG: [[VAL5:%.+]] = "tf.PadV2"(%arg0, [[VAL2]], [[VAL3]]) // CHECK-SAME: tensor<1x15x2xf32> // CHECK-DAG: [[VAL6:%.+]] = "tf.Reshape"([[VAL5]], [[VAL1]]) @@ -368,14 +368,14 @@ func.func @avoid_lowering_space_to_batch_nd(%arg0: tensor<1x8x2xf32>, %arg1: ten func.func @sixdim_space_to_batch_nd(%input: tensor<3x5x7x9x10x11xf32>, %block_shape: tensor<3xi64>, %paddings: tensor<3x2xi64>) -> tensor { // CHECK-DAG: [[PAD00:%.+]] = "tf.Const"() // CHECK-DAG: [[FULL_PADDINGS:%.+]] = "tf.ConcatV2"([[PAD00]], %arg2, [[PAD00]], [[PAD00]], {{.+}}) - // CHECK-DAG: [[INPUT_SHAPE:%.+]] = "tf.Const"() {value = dense<[3, 5, 7, 9, 10, 11]> : tensor<6xi64>} + // CHECK-DAG: [[INPUT_SHAPE:%.+]] = "tf.Const"() <{value = dense<[3, 5, 7, 9, 10, 11]> : tensor<6xi64>}> // CHECK-DAG: [[PADDED_SHAPE_SPLITS:%.+]]:6 = "tf.Split" // CHECK-DAG: [[BLOCK_SHAPE_SPLITS:%.+]]:3 = "tf.Split" // CHECK-DAG: [[OUTER_SHAPE_0:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#1, [[BLOCK_SHAPE_SPLITS]]#0) // CHECK-DAG: [[OUTER_SHAPE_1:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#2, [[BLOCK_SHAPE_SPLITS]]#1) // CHECK-DAG: [[OUTER_SHAPE_2:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#3, [[BLOCK_SHAPE_SPLITS]]#2) // CHECK-DAG: [[RESHAPED_SHAPE:%.+]] = "tf.ConcatV2"([[PADDED_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_0]], [[BLOCK_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_1]], [[BLOCK_SHAPE_SPLITS]]#1, [[OUTER_SHAPE_2]], [[BLOCK_SHAPE_SPLITS]]#2, [[PADDED_SHAPE_SPLITS]]#4, [[PADDED_SHAPE_SPLITS]]#5, {{.+}}) - // CHECK-DAG: [[PERMUTATION:%.+]] = "tf.Const"() {value = dense<[2, 4, 6, 0, 
1, 3, 5, 7, 8]> : tensor<9xi64>} + // CHECK-DAG: [[PERMUTATION:%.+]] = "tf.Const"() <{value = dense<[2, 4, 6, 0, 1, 3, 5, 7, 8]> : tensor<9xi64>}> // CHECK-DAG: [[OUTPUT_BATCH_PART1:%.+]] = "tf.Mul"([[PADDED_SHAPE_SPLITS]]#0, [[BLOCK_SHAPE_SPLITS]]#0) // CHECK-DAG: [[OUTPUT_BATCH_PART2:%.+]] = "tf.Mul"([[OUTPUT_BATCH_PART1]], [[BLOCK_SHAPE_SPLITS]]#1) // CHECK-DAG: [[OUTPUT_BATCH:%.+]] = "tf.Mul"([[OUTPUT_BATCH_PART2]], [[BLOCK_SHAPE_SPLITS]]#2) @@ -386,11 +386,11 @@ func.func @sixdim_space_to_batch_nd(%input: tensor<3x5x7x9x10x11xf32>, %block_sh // CHECK-LABEL: func @batchToSpace func.func @batchToSpace(%arg0: tensor<3x5x2xf32>) -> (tensor<1x8x2xf32>) { - // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() {value = dense<[3, 1, 5, 2]> : tensor<4xi64>} - // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() {value = dense<[1, 2, 0, 3]> : tensor<4xi64>} - // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() {value = dense<[1, 15, 2]> : tensor<3xi64>} - // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() {value = dense<[0, 3, 0]> : tensor<3xi64>} - // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() {value = dense<[1, 8, 2]> : tensor<3xi64>} + // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() <{value = dense<[3, 1, 5, 2]> : tensor<4xi64>}> + // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() <{value = dense<[1, 2, 0, 3]> : tensor<4xi64>}> + // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() <{value = dense<[1, 15, 2]> : tensor<3xi64>}> + // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() <{value = dense<[0, 3, 0]> : tensor<3xi64>}> + // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() <{value = dense<[1, 8, 2]> : tensor<3xi64>}> // CHECK-DAG: [[VAL5:%.+]] = "tf.Reshape"(%arg0, [[VAL0]]) // CHECK-DAG: [[VAL6:%.+]] = "tf.Transpose"([[VAL5]], [[VAL1]]) // CHECK-DAG: [[VAL7:%.+]] = "tf.Reshape"([[VAL6]], [[VAL2]]) @@ -404,11 +404,11 @@ func.func @batchToSpace(%arg0: tensor<3x5x2xf32>) -> (tensor<1x8x2xf32>) { } func.func @fake_quant_with_min_max_args(%arg0 : tensor) -> tensor { - // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() {value = dense<1.275000e+02> : tensor} - // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() {value = dense<1.00392163> : tensor} - // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() {value = dense<-0.996078491> : tensor} - // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() {value = dense<0.00784313772> : tensor} - // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} + // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() <{value = dense<1.275000e+02> : tensor}> + // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() <{value = dense<1.00392163> : tensor}> + // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() <{value = dense<-0.996078491> : tensor}> + // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() <{value = dense<0.00784313772> : tensor}> + // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor}> // CHECK-DAG: [[VAL5:%.+]] = "tf.ClipByValue"(%arg0, [[VAL2]], [[VAL1]]) // CHECK-DAG: [[VAL6:%.+]] = "tf.Sub"([[VAL5]], [[VAL2]]) // CHECK-DAG: [[VAL7:%.+]] = "tf.Mul"([[VAL6]], [[VAL0]]) @@ -423,11 +423,11 @@ func.func @fake_quant_with_min_max_args(%arg0 : tensor) -> tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[VAL1:.*]] = "tf.Const"() {value = dense<2.550000e+02> : tensor} : () -> tensor - // CHECK-DAG: %[[VAL2:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[VAL3:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[VAL4:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} : () -> 
tensor + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[VAL1:.*]] = "tf.Const"() <{value = dense<2.550000e+02> : tensor}> : () -> tensor + // CHECK-DAG: %[[VAL2:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[VAL3:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[VAL4:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor}> : () -> tensor // CHECK-DAG: %[[VAL5:.*]] = "tf.Sub"(%arg2, %arg1) : (tensor, tensor) -> tensor // CHECK-DAG: %[[VAL6:.*]] = "tf.Div"(%[[VAL5]], %[[VAL1]]) : (tensor, tensor) -> tensor // CHECK-DAG: %[[VAL7:.*]] = "tf.Div"(%[[VAL1]], %[[VAL5]]) : (tensor, tensor) -> tensor @@ -436,17 +436,17 @@ func.func @fake_quant_with_min_max_vars(%arg0 : tensor, %arg1 : tensor< // CHECK-DAG: %[[VAL10:.*]] = "tf.Floor"(%[[VAL9]]) : (tensor) -> tensor // CHECK-DAG: %[[VAL11:.*]] = "tf.Sub"(%[[VAL9]], %[[VAL10]]) : (tensor, tensor) -> tensor // CHECK-DAG: %[[VAL12:.*]] = "tf.Greater"(%[[VAL11]], %[[VAL4]]) : (tensor, tensor) -> tensor - // CHECK-DAG: %[[VAL13:.*]] = "tf.Equal"(%[[VAL11]], %[[VAL4]]) {incompatible_shape_error = true} : (tensor, tensor) -> tensor + // CHECK-DAG: %[[VAL13:.*]] = "tf.Equal"(%[[VAL11]], %[[VAL4]]) <{incompatible_shape_error = true}> : (tensor, tensor) -> tensor // CHECK-DAG: %[[VAL14:.*]] = "tf.Mul"(%[[VAL9]], %[[VAL4]]) : (tensor, tensor) -> tensor // CHECK-DAG: %[[VAL15:.*]] = "tf.Floor"(%[[VAL14]]) : (tensor) -> tensor // CHECK-DAG: %[[VAL16:.*]] = "tf.Mul"(%[[VAL15]], %[[VAL2]]) : (tensor, tensor) -> tensor // CHECK-DAG: %[[VAL17:.*]] = "tf.Sub"(%[[VAL10]], %[[VAL16]]) : (tensor, tensor) -> tensor - // CHECK-DAG: %[[VAL18:.*]] = "tf.Equal"(%[[VAL17]], %[[VAL3]]) {incompatible_shape_error = true} : (tensor, tensor) -> tensor + // CHECK-DAG: %[[VAL18:.*]] = "tf.Equal"(%[[VAL17]], %[[VAL3]]) <{incompatible_shape_error = true}> : (tensor, tensor) -> tensor // CHECK-DAG: %[[VAL19:.*]] = "tf.LogicalAnd"(%[[VAL13]], %[[VAL18]]) : (tensor, tensor) -> tensor // CHECK-DAG: %[[VAL20:.*]] = "tf.LogicalOr"(%[[VAL12]], %[[VAL19]]) : (tensor, tensor) -> tensor // CHECK-DAG: %[[VAL21:.*]] = "tf.AddV2"(%[[VAL10]], %[[VAL3]]) : (tensor, tensor) -> tensor // CHECK-DAG: %[[INNER_SELECT:.*]] = "tf.SelectV2"(%[[VAL20]], %[[VAL21]], %[[VAL10]]) : (tensor, tensor, tensor) -> tensor - // CHECK-DAG: %[[IS_ZERO:.*]] = "tf.Equal"(%[[INNER_SELECT]], %[[ZERO]]) {incompatible_shape_error = true} + // CHECK-DAG: %[[IS_ZERO:.*]] = "tf.Equal"(%[[INNER_SELECT]], %[[ZERO]]) <{incompatible_shape_error = true}> // CHECK-DAG: %[[VAL22:.*]] = "tf.SelectV2"(%[[IS_ZERO]], %[[ZERO]], %[[INNER_SELECT]]) // CHECK-DAG: %[[VAL23:.*]] = "tf.ClipByValue"(%[[VAL22]], %[[ZERO]], %[[VAL1]]) : (tensor, tensor, tensor) -> tensor // CHECK-DAG: %[[VAL24:.*]] = "tf.Sub"(%[[ZERO]], %[[VAL23]]) : (tensor, tensor) -> tensor @@ -469,29 +469,29 @@ func.func @fake_quant_with_min_max_vars(%arg0 : tensor, %arg1 : tensor< // CHECK-LABEL: SoftmaxCrossEntropyWithLogits // CHECK-SAME: %[[FEATURES:.*]]: tensor<2x3xf32>, %[[LABELS:.*]]: tensor<2x3xf32> func.func @SoftmaxCrossEntropyWithLogits(%features: tensor<2x3xf32>, %labels: tensor<2x3xf32>) -> (tensor<2xf32>, tensor<2x3xf32>) { - // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi64>} : () -> tensor<1xi64> - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor + // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi64>}> 
: () -> tensor<1xi64> + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor // CHECK-DAG: %[[NEG_LABELS:.*]] = "tf.Neg"(%[[LABELS]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> // LogSoftmax expansion. - // CHECK-DAG: %[[LOG_SOFTMAX_MAX:.*]] = "tf.Max"(%[[FEATURES]], %[[AXIS]]) {keep_dims = true} : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> + // CHECK-DAG: %[[LOG_SOFTMAX_MAX:.*]] = "tf.Max"(%[[FEATURES]], %[[AXIS]]) <{keep_dims = true}> : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> // CHECK-DAG: %[[LOG_SOFTMAX_SHIFTED:.*]] = "tf.Sub"(%[[FEATURES]], %[[LOG_SOFTMAX_MAX]]) : (tensor<2x3xf32>, tensor<2x1xf32>) -> tensor<2x3xf32> // CHECK-DAG: %[[LOG_SOFTMAX_EXP:.*]] = "tf.Exp"(%[[LOG_SOFTMAX_SHIFTED]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[LOG_SOFTMAX_SUM:.*]] = "tf.Sum"(%[[LOG_SOFTMAX_EXP]], %[[AXIS]]) {keep_dims = true} : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> + // CHECK-DAG: %[[LOG_SOFTMAX_SUM:.*]] = "tf.Sum"(%[[LOG_SOFTMAX_EXP]], %[[AXIS]]) <{keep_dims = true}> : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> // CHECK-DAG: %[[LOG_SOFTMAX_LOG:.*]] = "tf.Log"(%[[LOG_SOFTMAX_SUM]]) : (tensor<2x1xf32>) -> tensor<2x1xf32> // CHECK-DAG: %[[LOG_SOFTMAX:.*]] = "tf.Sub"(%[[LOG_SOFTMAX_SHIFTED]], %[[LOG_SOFTMAX_LOG]]) : (tensor<2x3xf32>, tensor<2x1xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[IS_LABEL_ZERO:.*]] = "tf.Equal"(%[[NEG_LABELS]], %[[ZERO]]) {incompatible_shape_error = true} : (tensor<2x3xf32>, tensor) -> tensor<2x3xi1> + // CHECK-DAG: %[[IS_LABEL_ZERO:.*]] = "tf.Equal"(%[[NEG_LABELS]], %[[ZERO]]) <{incompatible_shape_error = true}> : (tensor<2x3xf32>, tensor) -> tensor<2x3xi1> // CHECK-DAG: %[[LOSS_INP:.*]] = "tf.Mul"(%[[LOG_SOFTMAX]], %[[NEG_LABELS]]) : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> // CHECK-DAG: %[[SAFE_LOSS_INP:.*]] = "tf.SelectV2"(%[[IS_LABEL_ZERO]], %[[ZERO]], %[[LOSS_INP]]) : (tensor<2x3xi1>, tensor, tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[LOSS:.*]] = "tf.Sum"(%[[SAFE_LOSS_INP]], %[[AXIS]]) {keep_dims = false} : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2xf32> + // CHECK-DAG: %[[LOSS:.*]] = "tf.Sum"(%[[SAFE_LOSS_INP]], %[[AXIS]]) <{keep_dims = false}> : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2xf32> // Softmax expansion. 
- // CHECK-DAG: %[[SOFTMAX_MAX:.*]] = "tf.Max"(%arg0, %[[AXIS]]) {keep_dims = true} : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> + // CHECK-DAG: %[[SOFTMAX_MAX:.*]] = "tf.Max"(%arg0, %[[AXIS]]) <{keep_dims = true}> : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> // CHECK-DAG: %[[SOFTMAX_SHIFTED:.*]] = "tf.Sub"(%[[FEATURES]], %[[SOFTMAX_MAX]]) : (tensor<2x3xf32>, tensor<2x1xf32>) -> tensor<2x3xf32> // CHECK-DAG: %[[SOFTMAX_EXP:.*]] = "tf.Exp"(%[[SOFTMAX_SHIFTED]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[SOFTMAX_SUM:.*]] = "tf.Sum"(%[[SOFTMAX_EXP]], %[[AXIS]]) {keep_dims = true} : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> + // CHECK-DAG: %[[SOFTMAX_SUM:.*]] = "tf.Sum"(%[[SOFTMAX_EXP]], %[[AXIS]]) <{keep_dims = true}> : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> // CHECK-DAG: %[[SOFTMAX:.*]] = "tf.Div"(%[[SOFTMAX_EXP]], %[[SOFTMAX_SUM]]) : (tensor<2x3xf32>, tensor<2x1xf32>) -> tensor<2x3xf32> // CHECK-DAG: %[[BACKPROP:.*]] = "tf.Sub"(%[[SOFTMAX]], %[[LABELS]]) : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> @@ -534,19 +534,19 @@ func.func @scalar_SoftmaxCrossEntropyWithLogits(%features: tensor, %labels: // CHECK-SAME: %[[FEATURES:.*]]: tensor<2x3xf32>, %[[SPARSE_LABELS:.*]]: tensor<2xi32> func.func @SparseSoftmaxCrossEntropyWithLogits(%features: tensor<2x3xf32>, %labels: tensor<2xi32>) -> (tensor<2xf32>, tensor<2x3xf32>) { // Convert SPARSE_LABELS to dense LABELS. - // CHECK-DAG: %[[DEPTH:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor - // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[LABELS:.*]] = "tf.OneHot"(%[[SPARSE_LABELS]], %[[DEPTH]], %[[ONE]], %[[ZERO]]) {axis = 1 : i64} : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> + // CHECK-DAG: %[[DEPTH:.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor + // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[LABELS:.*]] = "tf.OneHot"(%[[SPARSE_LABELS]], %[[DEPTH]], %[[ONE]], %[[ZERO]]) <{axis = 1 : i64}> : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> // Adjust labels to have Nan for out of range labels. 
- // CHECK-DAG: %[[ZERO_I32:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-DAG: %[[ZERO_I32:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-DAG: %[[IS_NEGATIVE:.*]] = "tf.LessEqual"(%[[ZERO_I32]], %arg1) : (tensor, tensor<2xi32>) -> tensor<2xi1> // CHECK-DAG: %[[IS_LESS:.*]] = "tf.Less"(%arg1, %[[DEPTH]]) : (tensor<2xi32>, tensor) -> tensor<2xi1> // CHECK-DAG: %[[IS_WITHIN_RANGE:.*]] = "tf.LogicalAnd"(%[[IS_NEGATIVE]], %[[IS_LESS]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> - // CHECK-DAG: %[[NAN:.*]] = "tf.Const"() {value = dense<0x7FC00000> : tensor} : () -> tensor + // CHECK-DAG: %[[NAN:.*]] = "tf.Const"() <{value = dense<0x7FC00000> : tensor}> : () -> tensor // CHECK-DAG: %[[ZERO_OR_NAN:.*]] = "tf.SelectV2"(%[[IS_WITHIN_RANGE]], %[[ZERO]], %[[NAN]]) : (tensor<2xi1>, tensor, tensor) -> tensor<2xf32> - // CHECK-DAG: %[[NEG_ONE:.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi64>} : () -> tensor<1xi64> + // CHECK-DAG: %[[NEG_ONE:.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi64>}> : () -> tensor<1xi64> // CHECK-DAG: %[[RESHAPE:.*]] = "tf.ExpandDims"(%[[ZERO_OR_NAN]], %[[NEG_ONE]]) : (tensor<2xf32>, tensor<1xi64>) -> tensor<2x1xf32> // CHECK-DAG: %[[ADJUSTED_LABELS:.*]] = "tf.AddV2"(%[[LABELS]], %[[RESHAPE]]) : (tensor<2x3xf32>, tensor<2x1xf32>) -> tensor<2x3xf32> @@ -589,7 +589,7 @@ func.func @SparseSoftmaxCrossEntropyWithLogits_with_dynamic(%features: tensor<*x // CHECK-LABEL: func @tanhgrad_float // CHECK-SAME: (%[[Y:.*]]: tensor<*xf32>, %[[DY:.*]]: tensor<*xf32>) func.func @tanhgrad_float(%y : tensor<*xf32>, %dy: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK: %[[Y_SQUARE:.*]] = "tf.Mul"(%[[Y]], %[[Y]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[ONE]], %[[Y_SQUARE]]) : (tensor, tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[RESULT:.*]] = "tf.Mul"(%[[DY]], %[[SUB]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> @@ -610,7 +610,7 @@ func.func @tanhgrad_complex(%y : tensor<*xcomplex>, %dy: tensor<*xcomplex) -> tensor<*xi32> { - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[SHAPE:.*]] = "tf.Shape"(%arg0) : (tensor<*xi32>) -> tensor // CHECK: "tf.BroadcastTo"(%[[ZERO]], %[[SHAPE]]) : (tensor, tensor) -> tensor<*xi32> @@ -627,7 +627,7 @@ func.func @ZerosLike_variant(%arg0: tensor>>) -> // CHECK-LABEL: func @OnesLike_unranked func.func @OnesLike_unranked(%arg0: tensor<*xi32>) -> tensor<*xi32> { - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[SHAPE:.*]] = "tf.Shape"(%arg0) : (tensor<*xi32>) -> tensor // CHECK: "tf.BroadcastTo"(%[[ONE]], %[[SHAPE]]) : (tensor, tensor) -> tensor<*xi32> @@ -682,8 +682,8 @@ func.func @addN_variant(%arg0: tensor>>, %arg1: t // CHECK-LABEL: func @DynamicStitch_simple func.func @DynamicStitch_simple(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { - // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<2x2xf32>) -> (tensor<2xf32>, tensor<2xf32>) + // CHECK: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> 
tensor + // CHECK: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) <{axis = 0 : i64}> : (tensor<2x2xf32>) -> (tensor<2xf32>, tensor<2xf32>) // CHECK-DAG: %[[ITEMS_1:.*]] = "tf.ExpandDims"(%[[ITEMS]]#1, %[[AXIS]]) // CHECK-DAG: %[[ITEMS_0:.*]] = "tf.ExpandDims"(%[[ITEMS]]#0, %[[AXIS]]) // CHECK: %[[RESULT:.*]] = "tf.ConcatV2"(%[[ITEMS_1]], %[[ITEMS_0]], %[[AXIS]]) : (tensor<1x2xf32>, tensor<1x2xf32>, tensor) -> tensor<2x2xf32> @@ -696,12 +696,12 @@ func.func @DynamicStitch_simple(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK-LABEL: DynamicStitch_scalar_matrix_indices func.func @DynamicStitch_scalar_matrix_indices(%arg0: tensor<2xf32>, %arg1: tensor<2x2x2xf32>) -> (tensor<5x2xf32>) { - // CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[-1, 2]> : tensor<2xi64>} : () -> tensor<2xi64> + // CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[-1, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK-DAG: %[[INP0:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) : (tensor<2xf32>, tensor<2xi64>) -> tensor<1x2xf32> - // CHECK-DAG: %[[ITEMS0:.*]] = "tf.Unpack"(%[[INP0]]) {axis = 0 : i64} : (tensor<1x2xf32>) -> tensor<2xf32> + // CHECK-DAG: %[[ITEMS0:.*]] = "tf.Unpack"(%[[INP0]]) <{axis = 0 : i64}> : (tensor<1x2xf32>) -> tensor<2xf32> // CHECK-DAG: %[[INP1:.*]] = "tf.Reshape"(%arg1, %[[SHAPE]]) : (tensor<2x2x2xf32>, tensor<2xi64>) -> tensor<4x2xf32> - // CHECK-DAG: %[[ITEMS1:.*]]:4 = "tf.Unpack"(%[[INP1]]) {axis = 0 : i64} : (tensor<4x2xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) - // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-DAG: %[[ITEMS1:.*]]:4 = "tf.Unpack"(%[[INP1]]) <{axis = 0 : i64}> : (tensor<4x2xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) + // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-DAG: %[[ITEMS1_3:.*]] = "tf.ExpandDims"(%[[ITEMS1]]#3, %[[AXIS]]) // CHECK-DAG: %[[ITEMS1_2:.*]] = "tf.ExpandDims"(%[[ITEMS1]]#2, %[[AXIS]]) // CHECK-DAG: %[[ITEMS1_1:.*]] = "tf.ExpandDims"(%[[ITEMS1]]#1, %[[AXIS]]) @@ -727,8 +727,8 @@ func.func @DynamicStitch_uint8(%arg0: tensor<2x2xui8>) -> tensor<2x2xui8> { // CHECK-LABEL: func @DynamicStitch_scalar_item func.func @DynamicStitch_scalar_item(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<2xf32>) -> (tensor, tensor) - // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) <{axis = 0 : i64}> : (tensor<2xf32>) -> (tensor, tensor) + // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-DAG: %[[ITEMS_1:.*]] = "tf.ExpandDims"(%[[ITEMS]]#1, %[[AXIS]]) // CHECK-DAG: %[[ITEMS_0:.*]] = "tf.ExpandDims"(%[[ITEMS]]#0, %[[AXIS]]) // CHECK-DAG: %[[RESULT:.*]] = "tf.ConcatV2"(%[[ITEMS_1]], %[[ITEMS_0]], %[[AXIS]]) : (tensor<1xf32>, tensor<1xf32>, tensor) -> tensor<2xf32> @@ -741,8 +741,8 @@ func.func @DynamicStitch_scalar_item(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK-LABEL: func @DynamicStitch_matrix_item func.func @DynamicStitch_matrix_item(%arg0: tensor<2x2x2xf32>) -> tensor<2x2x2xf32> { - // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<2x2x2xf32>) -> (tensor<2x2xf32>, tensor<2x2xf32>) - // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) <{axis = 0 : i64}> : (tensor<2x2x2xf32>) -> (tensor<2x2xf32>, 
tensor<2x2xf32>) + // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-DAG: %[[ITEMS_1:.*]] = "tf.ExpandDims"(%[[ITEMS]]#1, %[[AXIS]]) // CHECK-DAG: %[[ITEMS_0:.*]] = "tf.ExpandDims"(%[[ITEMS]]#0, %[[AXIS]]) // CHECK-DAG: %[[RESULT:.*]] = "tf.ConcatV2"(%[[ITEMS_1]], %[[ITEMS_0]], %[[AXIS]]) : (tensor<1x2x2xf32>, tensor<1x2x2xf32>, tensor) -> tensor<2x2x2xf32> @@ -762,8 +762,8 @@ func.func @DynamicStitch_dynamic(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> // CHECK-LABEL: func @DynamicStitch_duplicates func.func @DynamicStitch_duplicates(%arg0: tensor<2x2xf32>) -> tensor<1x2xf32> { - // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<2x2xf32>) -> (tensor<2xf32>, tensor<2xf32>) - // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-DAG: %[[ITEMS:.*]]:2 = "tf.Unpack"(%arg0) <{axis = 0 : i64}> : (tensor<2x2xf32>) -> (tensor<2xf32>, tensor<2xf32>) + // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-DAG: %[[ITEMS_1:.*]] = "tf.ExpandDims"(%[[ITEMS]]#1, %[[AXIS]]) // CHECK-DAG: %[[RESULT:.*]] = "tf.ConcatV2"(%[[ITEMS_1]], %[[AXIS]]) : (tensor<1x2xf32>, tensor) -> tensor<1x2xf32> // CHECK: return %[[RESULT]] @@ -783,7 +783,7 @@ func.func @ParallelDynamicStitch(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK-LABEL: @Reciprocal_i32 func.func @Reciprocal_i32(%arg0: tensor<*xi32>) -> tensor<*xi32> { - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor, tensor<*xi32>) -> tensor<*xi32> %0 = "tf.Reciprocal"(%arg0) : (tensor<*xi32>) -> tensor<*xi32> func.return %0 : tensor<*xi32> @@ -791,7 +791,7 @@ func.func @Reciprocal_i32(%arg0: tensor<*xi32>) -> tensor<*xi32> { // CHECK-LABEL: @Reciprocal_f32 func.func @Reciprocal_f32(%arg0: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor, tensor<*xf32>) -> tensor<*xf32> %0 = "tf.Reciprocal"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -799,7 +799,7 @@ func.func @Reciprocal_f32(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: @Reciprocal_complexf32 func.func @Reciprocal_complexf32(%arg0: tensor<*xcomplex>) -> tensor<*xcomplex> { - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<(1.000000e+00,0.000000e+00)> : tensor>} : () -> tensor> + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<(1.000000e+00,0.000000e+00)> : tensor>}> : () -> tensor> // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor>, tensor<*xcomplex>) -> tensor<*xcomplex> %0 = "tf.Reciprocal"(%arg0) : (tensor<*xcomplex>) -> tensor<*xcomplex> func.return %0 : tensor<*xcomplex> @@ -807,7 +807,7 @@ func.func @Reciprocal_complexf32(%arg0: tensor<*xcomplex>) -> tensor<*xcomp // CHECK-LABEL: @Reciprocal_complexf64 func.func @Reciprocal_complexf64(%arg0: tensor<*xcomplex>) -> tensor<*xcomplex> { - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<(1.000000e+00,0.000000e+00)> : tensor>} : () -> tensor> + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<(1.000000e+00,0.000000e+00)> : tensor>}> : () -> tensor> // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor>, tensor<*xcomplex>) -> tensor<*xcomplex> %0 = "tf.Reciprocal"(%arg0) : 
(tensor<*xcomplex>) -> tensor<*xcomplex> func.return %0 : tensor<*xcomplex> @@ -816,7 +816,7 @@ func.func @Reciprocal_complexf64(%arg0: tensor<*xcomplex>) -> tensor<*xcomp // Inv is the same as Reciprocal // CHECK-LABEL: @Inv_i32 func.func @Inv_i32(%arg0: tensor<*xi32>) -> tensor<*xi32> { - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor, tensor<*xi32>) -> tensor<*xi32> %0 = "tf.Inv"(%arg0) : (tensor<*xi32>) -> tensor<*xi32> func.return %0 : tensor<*xi32> @@ -824,7 +824,7 @@ func.func @Inv_i32(%arg0: tensor<*xi32>) -> tensor<*xi32> { // CHECK-LABEL: @ScatterNd func.func @ScatterNd(%arg0: tensor<4x1xi32>, %arg1: tensor<4xf32>) -> tensor<8xf32> { - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<8xf32>} : () -> tensor<8xf32> + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<8xf32>}> : () -> tensor<8xf32> // CHECK: "tf.TensorScatterAdd"(%[[ZERO]], %arg0, %arg1) : (tensor<8xf32>, tensor<4x1xi32>, tensor<4xf32>) -> tensor<8xf32> %shape = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> tensor<1xi32> @@ -856,24 +856,24 @@ func.func @round_int(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-LABEL: @round func.func @round(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[HALF:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} : () -> tensor - // CHECK-DAG: %[[TWO:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[HALF:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor}> : () -> tensor + // CHECK-DAG: %[[TWO:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK: %[[ROUND_VAL:.*]] = "tf.Floor"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[FRACTION:.*]] = "tf.Sub"(%arg0, %[[ROUND_VAL]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[GT:.*]] = "tf.Greater"(%[[FRACTION]], %[[HALF]]) : (tensor<2xf32>, tensor) -> tensor<2xi1> - // CHECK: %[[EQ:.*]] = "tf.Equal"(%[[FRACTION]], %[[HALF]]) {incompatible_shape_error = true} : (tensor<2xf32>, tensor) -> tensor<2xi1> + // CHECK: %[[EQ:.*]] = "tf.Equal"(%[[FRACTION]], %[[HALF]]) <{incompatible_shape_error = true}> : (tensor<2xf32>, tensor) -> tensor<2xi1> // CHECK: %[[MUL1:.*]] = "tf.Mul"(%arg0, %[[HALF]]) : (tensor<2xf32>, tensor) -> tensor<2xf32> // CHECK: %[[FLOOR:.*]] = "tf.Floor"(%[[MUL1]]) : (tensor<2xf32>) -> tensor<2xf32> // CHECK: %[[MUL2:.*]] = "tf.Mul"(%[[FLOOR]], %[[TWO]]) : (tensor<2xf32>, tensor) -> tensor<2xf32> // CHECK: %[[NEAREST_EVEN_INT:.*]] = "tf.Sub"(%[[ROUND_VAL]], %[[MUL2]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> - // CHECK: %[[IS_ODD:.*]] = "tf.Equal"(%[[NEAREST_EVEN_INT]], %[[ONE]]) {incompatible_shape_error = true} : (tensor<2xf32>, tensor) -> tensor<2xi1> + // CHECK: %[[IS_ODD:.*]] = "tf.Equal"(%[[NEAREST_EVEN_INT]], %[[ONE]]) <{incompatible_shape_error = true}> : (tensor<2xf32>, tensor) -> tensor<2xi1> // CHECK: %[[AND:.*]] = "tf.LogicalAnd"(%[[EQ]], %[[IS_ODD]]) : (tensor<2xi1>, 
tensor<2xi1>) -> tensor<2xi1> // CHECK: %[[OR:.*]] = "tf.LogicalOr"(%[[GT]], %[[AND]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ROUND_VAL]], %[[ONE]]) : (tensor<2xf32>, tensor) -> tensor<2xf32> // CHECK: %[[INNER_SELECT:.*]] = "tf.SelectV2"(%[[OR]], %[[ADD]], %[[ROUND_VAL]]) : (tensor<2xi1>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> - // CHECK-DAG: %[[IS_ZERO:.*]] = "tf.Equal"(%[[INNER_SELECT]], %[[ZERO]]) {incompatible_shape_error = true} + // CHECK-DAG: %[[IS_ZERO:.*]] = "tf.Equal"(%[[INNER_SELECT]], %[[ZERO]]) <{incompatible_shape_error = true}> // CHECK-DAG: %[[SELECT:.*]] = "tf.SelectV2"(%[[IS_ZERO]], %[[ZERO]], %[[INNER_SELECT]]) %0 = "tf.Round"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> @@ -890,24 +890,24 @@ func.func @round_dynamic(%arg0: tensor) -> tensor { // CHECK-LABEL: func @rint_dynamic func.func @rint_dynamic(%arg0: tensor) -> tensor { - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[HALF:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} : () -> tensor - // CHECK-DAG: %[[TWO:.*]] = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[HALF:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor}> : () -> tensor + // CHECK-DAG: %[[TWO:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK: %[[ROUND_VAL:.*]] = "tf.Floor"(%arg0) : (tensor) -> tensor // CHECK: %[[FRACTION:.*]] = "tf.Sub"(%arg0, %[[ROUND_VAL]]) : (tensor, tensor) -> tensor // CHECK: %[[GT:.*]] = "tf.Greater"(%[[FRACTION]], %[[HALF]]) : (tensor, tensor) -> tensor - // CHECK: %[[EQ:.*]] = "tf.Equal"(%[[FRACTION]], %[[HALF]]) {incompatible_shape_error = true} : (tensor, tensor) -> tensor + // CHECK: %[[EQ:.*]] = "tf.Equal"(%[[FRACTION]], %[[HALF]]) <{incompatible_shape_error = true}> : (tensor, tensor) -> tensor // CHECK: %[[MUL1:.*]] = "tf.Mul"(%arg0, %[[HALF]]) : (tensor, tensor) -> tensor // CHECK: %[[FLOOR:.*]] = "tf.Floor"(%[[MUL1]]) : (tensor) -> tensor // CHECK: %[[MUL2:.*]] = "tf.Mul"(%[[FLOOR]], %[[TWO]]) : (tensor, tensor) -> tensor // CHECK: %[[NEAREST_EVEN_INT:.*]] = "tf.Sub"(%[[ROUND_VAL]], %[[MUL2]]) : (tensor, tensor) -> tensor - // CHECK: %[[IS_ODD:.*]] = "tf.Equal"(%[[NEAREST_EVEN_INT]], %[[ONE]]) {incompatible_shape_error = true} : (tensor, tensor) -> tensor + // CHECK: %[[IS_ODD:.*]] = "tf.Equal"(%[[NEAREST_EVEN_INT]], %[[ONE]]) <{incompatible_shape_error = true}> : (tensor, tensor) -> tensor // CHECK: %[[AND:.*]] = "tf.LogicalAnd"(%[[EQ]], %[[IS_ODD]]) : (tensor, tensor) -> tensor // CHECK: %[[OR:.*]] = "tf.LogicalOr"(%[[GT]], %[[AND]]) : (tensor, tensor) -> tensor // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ROUND_VAL]], %[[ONE]]) : (tensor, tensor) -> tensor // CHECK: %[[INNER_SELECT:.*]] = "tf.SelectV2"(%[[OR]], %[[ADD]], %[[ROUND_VAL]]) : (tensor, tensor, tensor) -> tensor - // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[INNER_SELECT]], %[[ZERO]]) {incompatible_shape_error = true} + // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[INNER_SELECT]], %[[ZERO]]) <{incompatible_shape_error = true}> // CHECK: %[[SELECT:.*]] = "tf.SelectV2"(%[[IS_ZERO]], %[[ZERO]], %[[INNER_SELECT]]) %0 = "tf.Rint"(%arg0) : (tensor) -> tensor @@ -937,12 
+937,12 @@ func.func @lgamma(%arg0: tensor<4xf32>) -> tensor<4xf32> {
func.func @imag_resize_nearest(%arg0: tensor<1x7x7x1xi32>) -> tensor<1x3x3x1xi32> {
%shape = "tf.Const"() {device = "", value = dense<3> : tensor<2xi32>} : () -> tensor<2xi32>
- // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() {value = dense<1> : tensor}
- // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() {value = dense<[1, 3, 3, 1]>
- // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() {value = dense<[1, 49, 1]>
- // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() {value = dense<[0, 2, 4, 14, 16, 18, 28, 30, 32]> : tensor<9xi32>}
+ // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() <{value = dense<1> : tensor}>
+ // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() <{value = dense<[1, 3, 3, 1]>
+ // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() <{value = dense<[1, 49, 1]>
+ // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() <{value = dense<[0, 2, 4, 14, 16, 18, 28, 30, 32]> : tensor<9xi32>}>
// CHECK: [[VAL4:%.+]] = "tf.Reshape"(%arg0, [[VAL2]])
- // CHECK: [[VAL5:%.+]] = "tf.GatherV2"([[VAL4]], [[VAL3]], [[VAL0]]) {batch_dims = 0 : i64}
+ // CHECK: [[VAL5:%.+]] = "tf.GatherV2"([[VAL4]], [[VAL3]], [[VAL0]]) <{batch_dims = 0 : i64}>
// CHECK: [[VAL6:%.+]] = "tf.Reshape"([[VAL5]], [[VAL1]])
// CHECK: return [[VAL6]]
%resize = "tf.ResizeNearestNeighbor"(%arg0, %shape) {align_corners = false, device = "", half_pixel_centers = false} : (tensor<1x7x7x1xi32>, tensor<2xi32>) -> tensor<1x3x3x1xi32>
@@ -953,17 +953,17 @@ func.func @imag_resize_nearest(%arg0: tensor<1x7x7x1xi32>) -> tensor<1x3x3x1xi32
func.func @imag_resize_nearest_dyn_img(%arg0: tensor<1x?x?x1xi32>) -> tensor<1x3x3x1xi32> {
%shape = "tf.Const"() {device = "", value = dense<3> : tensor<2xi32>} : () -> tensor<2xi32>
- // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() {value = dense<1> : tensor}
- // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() {value = dense<[3, 1]> : tensor<2xi32>}
- // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() {value = dense<9> : tensor<1xi32>}
- // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() {value = dense<3> : tensor<1xi32>}
- // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() {value = dense<[1, 3]> : tensor<2xi32>}
- // CHECK-DAG: [[VAL5:%.+]] = "tf.Const"() {value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00]>
- // CHECK-DAG: [[VAL6:%.+]] = "tf.Const"() {value = dense<3.000000e+00> : tensor}
- // CHECK-DAG: [[VAL7:%.+]] = "tf.Const"() {value = dense<0> : tensor}
+ // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() <{value = dense<1> : tensor}>
+ // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() <{value = dense<[3, 1]> : tensor<2xi32>}>
+ // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() <{value = dense<9> : tensor<1xi32>}>
+ // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() <{value = dense<3> : tensor<1xi32>}>
+ // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() <{value = dense<[1, 3]> : tensor<2xi32>}>
+ // CHECK-DAG: [[VAL5:%.+]] = "tf.Const"() <{value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00]>
+ // CHECK-DAG: [[VAL6:%.+]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor}>
+ // CHECK-DAG: [[VAL7:%.+]] = "tf.Const"() <{value = dense<0> : tensor}>
// CHECK: [[VAL8:%.+]] = "tf.Shape"(%arg0)
// CHECK: [[VAL9:%.+]] = "tf.Cast"([[VAL8]])
- // CHECK: [[VAL10:%.+]]:4 = "tf.Unpack"([[VAL9]]) {axis = 0 : i64}
+ // CHECK: [[VAL10:%.+]]:4 = "tf.Unpack"([[VAL9]]) <{axis = 0 : i64}>
// CHECK: [[VAL11:%.+]] = "tf.Mul"([[VAL10]]#1, [[VAL10]]#2)
// CHECK: [[VAL12:%.+]] = "tf.ExpandDims"([[VAL10]]#0, [[VAL7]])
// CHECK: [[VAL13:%.+]] = "tf.ExpandDims"([[VAL10]]#3, [[VAL7]])
@@ -986,7 +986,7 @@ func.func @imag_resize_nearest_dyn_img(%arg0: tensor<1x?x?x1xi32>) -> tensor<1x3
// CHECK: [[VAL30:%.+]] = "tf.ExpandDims"([[VAL10]]#3, [[VAL7]])
// CHECK: [[VAL31:%.+]] = "tf.ConcatV2"([[VAL28]], [[VAL29]], [[VAL30]], [[VAL7]])
// CHECK: [[VAL32:%.+]] = "tf.Reshape"(%arg0, [[VAL31]])
- // CHECK: [[VAL33:%.+]] = "tf.GatherV2"([[VAL32]], [[VAL27]], [[VAL0]]) {batch_dims = 0 : i64}
+ // CHECK: [[VAL33:%.+]] = "tf.GatherV2"([[VAL32]], [[VAL27]], [[VAL0]]) <{batch_dims = 0 : i64}>
// CHECK: [[VAL34:%.+]] = "tf.Reshape"([[VAL33]], [[VAL14]])
// CHECK: return [[VAL34]]
%resize = "tf.ResizeNearestNeighbor"(%arg0, %shape) {align_corners = false, device = "", half_pixel_centers = false} : (tensor<1x?x?x1xi32>, tensor<2xi32>) -> tensor<1x3x3x1xi32>
@@ -996,17 +996,17 @@ func.func @imag_resize_nearest_dyn_img(%arg0: tensor<1x?x?x1xi32>) -> tensor<1x3
// CHECK-LABEL: func @imag_resize_nearest_full_dyn
func.func @imag_resize_nearest_full_dyn(%arg0: tensor<1x?x?x1xi32>, %arg1: tensor<2xi32>) -> tensor<1x?x?x1xi32> {
- // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() {value = dense<1> : tensor}
- // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor}
- // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() {value = dense<1.000000e+00> : tensor}
- // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() {value = dense<1> : tensor<1xi32>}
- // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() {value = dense<1> : tensor<1xi64>}
- // CHECK-DAG: [[VAL5:%.+]] = "tf.Const"() {value = dense<0> : tensor}
+ // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() <{value = dense<1> : tensor}>
+ // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}>
+ // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}>
+ // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}>
+ // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() <{value = dense<1> : tensor<1xi64>}>
+ // CHECK-DAG: [[VAL5:%.+]] = "tf.Const"() <{value = dense<0> : tensor}>
// CHECK: [[VAL6:%.+]] = "tf.Shape"(%arg0)
// CHECK: [[VAL7:%.+]] = "tf.Cast"([[VAL6]])
- // CHECK: [[VAL8:%.+]]:4 = "tf.Unpack"([[VAL7]]) {axis = 0 : i64}
+ // CHECK: [[VAL8:%.+]]:4 = "tf.Unpack"([[VAL7]]) <{axis = 0 : i64}>
// CHECK: [[VAL9:%.+]] = "tf.Mul"([[VAL8]]#1, [[VAL8]]#2)
- // CHECK: [[VAL10:%.+]]:2 = "tf.Unpack"(%arg1) {axis = 0 : i64}
+ // CHECK: [[VAL10:%.+]]:2 = "tf.Unpack"(%arg1) <{axis = 0 : i64}>
// CHECK: [[VAL11:%.+]] = "tf.Mul"([[VAL10]]#0, [[VAL10]]#1)
// CHECK: [[VAL12:%.+]] = "tf.ExpandDims"([[VAL8]]#0, [[VAL5]])
// CHECK: [[VAL13:%.+]] = "tf.ExpandDims"([[VAL10]]#0, [[VAL5]])
@@ -1040,7 +1040,7 @@ func.func @imag_resize_nearest_full_dyn(%arg0: tensor<1x?x?x1xi32>, %arg1: tenso
// CHECK: [[VAL41:%.+]] = "tf.ExpandDims"([[VAL8]]#3, [[VAL5]])
// CHECK: [[VAL42:%.+]] = "tf.ConcatV2"([[VAL39]], [[VAL40]], [[VAL41]], [[VAL5]])
// CHECK: [[VAL43:%.+]] = "tf.Reshape"(%arg0, [[VAL42]])
- // CHECK: [[VAL44:%.+]] = "tf.GatherV2"([[VAL43]], [[VAL38]], [[VAL0]]) {batch_dims = 0 : i64}
+ // CHECK: [[VAL44:%.+]] = "tf.GatherV2"([[VAL43]], [[VAL38]], [[VAL0]]) <{batch_dims = 0 : i64}>
// CHECK: [[VAL45:%.+]] = "tf.Reshape"([[VAL44]], [[VAL16]])
// CHECK: return [[VAL45]]
%resize = "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = false, device = "", half_pixel_centers = false} : (tensor<1x?x?x1xi32>, tensor<2xi32>) -> tensor<1x?x?x1xi32>
@@ -1050,8 +1050,8 @@ func.func @imag_resize_nearest_full_dyn(%arg0: tensor<1x?x?x1xi32>, %arg1: tenso
// CHECK-LABEL: func @xdivy
// CHECK-SAME: (%[[X:.*]]: tensor<*xf32>, %[[Y:.*]]: tensor<*xf32>)
func.func @xdivy(%lhs: tensor<*xf32>, %rhs: tensor<*xf32>) -> tensor<*xf32> {
- // CHECK:
%[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[X]], %[[ZERO]]) {incompatible_shape_error = true} : (tensor<*xf32>, tensor) -> tensor<*xi1> + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[X]], %[[ZERO]]) <{incompatible_shape_error = true}> : (tensor<*xf32>, tensor) -> tensor<*xi1> // CHECK: %[[MUL:.*]] = "tf.Div"(%[[X]], %[[Y]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[RESULT:.*]] = "tf.SelectV2"(%[[IS_ZERO]], %[[X]], %[[MUL]]) : (tensor<*xi1>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> %0 = "tf.Xdivy"(%lhs, %rhs) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> @@ -1062,8 +1062,8 @@ func.func @xdivy(%lhs: tensor<*xf32>, %rhs: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: func @xlog1py // CHECK-SAME: (%[[X:.*]]: tensor<*xf32>, %[[Y:.*]]: tensor<*xf32>) func.func @xlog1py(%lhs: tensor<*xf32>, %rhs: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[X]], %[[ZERO]]) {incompatible_shape_error = true} : (tensor<*xf32>, tensor) -> tensor<*xi1> + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[X]], %[[ZERO]]) <{incompatible_shape_error = true}> : (tensor<*xf32>, tensor) -> tensor<*xi1> // CHECK: %[[LOG:.*]] = "tf.Log1p"(%[[Y]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[X]], %[[LOG]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[RESULT:.*]] = "tf.SelectV2"(%[[IS_ZERO]], %[[X]], %[[MUL]]) : (tensor<*xi1>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> @@ -1075,8 +1075,8 @@ func.func @xlog1py(%lhs: tensor<*xf32>, %rhs: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: func @xlogy // CHECK-SAME: (%[[X:.*]]: tensor<*xf32>, %[[Y:.*]]: tensor<*xf32>) func.func @xlogy(%lhs: tensor<*xf32>, %rhs: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[X]], %[[ZERO]]) {incompatible_shape_error = true} : (tensor<*xf32>, tensor) -> tensor<*xi1> + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK: %[[IS_ZERO:.*]] = "tf.Equal"(%[[X]], %[[ZERO]]) <{incompatible_shape_error = true}> : (tensor<*xf32>, tensor) -> tensor<*xi1> // CHECK: %[[LOG:.*]] = "tf.Log"(%[[Y]]) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[X]], %[[LOG]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> // CHECK: %[[RESULT:.*]] = "tf.SelectV2"(%[[IS_ZERO]], %[[X]], %[[MUL]]) : (tensor<*xi1>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> @@ -1089,9 +1089,9 @@ func.func @xlogy(%lhs: tensor<*xf32>, %rhs: tensor<*xf32>) -> tensor<*xf32> { func.func @size_to_prod_shape_i32(%arg0 : tensor<1x?x2x3xf32>) -> tensor { %0 = "tf.Size"(%arg0) : (tensor<1x?x2x3xf32>) -> tensor func.return %0 : tensor - // CHECK: %[[CONSTANT:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK: %[[CONSTANT:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[SHAPE:.*]] = "tf.Shape"(%arg0) : (tensor<1x?x2x3xf32>) -> tensor<4xi32> - // CHECK: %[[PROD:.*]] = "tf.Prod"(%[[SHAPE]], %[[CONSTANT]]) {keep_dims = false} : (tensor<4xi32>, tensor) -> tensor + // CHECK: %[[PROD:.*]] = 
"tf.Prod"(%[[SHAPE]], %[[CONSTANT]]) <{keep_dims = false}> : (tensor<4xi32>, tensor) -> tensor // CHECK: return %[[PROD]] } @@ -1099,9 +1099,9 @@ func.func @size_to_prod_shape_i32(%arg0 : tensor<1x?x2x3xf32>) -> tensor { func.func @size_to_prod_shape_i64(%arg0 : tensor<1x?x2x3xf32>) -> tensor { %0 = "tf.Size"(%arg0) : (tensor<1x?x2x3xf32>) -> tensor func.return %0 : tensor - // CHECK: %[[CONSTANT:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK: %[[CONSTANT:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[SHAPE:.*]] = "tf.Shape"(%arg0) : (tensor<1x?x2x3xf32>) -> tensor<4xi64> - // CHECK: %[[PROD:.*]] = "tf.Prod"(%[[SHAPE]], %[[CONSTANT]]) {keep_dims = false} : (tensor<4xi64>, tensor) -> tensor + // CHECK: %[[PROD:.*]] = "tf.Prod"(%[[SHAPE]], %[[CONSTANT]]) <{keep_dims = false}> : (tensor<4xi64>, tensor) -> tensor // CHECK: return %[[PROD]] } @@ -1109,9 +1109,9 @@ func.func @size_to_prod_shape_i64(%arg0 : tensor<1x?x2x3xf32>) -> tensor { func.func @is_finite(%arg0: tensor<3x4xf32>) -> tensor<3x4xi1> { %0 = "tf.IsFinite"(%arg0) : (tensor<3x4xf32>) -> tensor<3x4xi1> func.return %0 : tensor<3x4xi1> - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor // CHECK: %[[SUB:.*]] = "tf.Sub"(%arg0, %arg0) : (tensor<3x4xf32>, tensor<3x4xf32>) -> tensor<3x4xf32> - // CHECK: %[[RESULT:.*]] = "tf.Equal"(%[[SUB]], %[[ZERO]]) {incompatible_shape_error = true} : (tensor<3x4xf32>, tensor) -> tensor<3x4xi1> + // CHECK: %[[RESULT:.*]] = "tf.Equal"(%[[SUB]], %[[ZERO]]) <{incompatible_shape_error = true}> : (tensor<3x4xf32>, tensor) -> tensor<3x4xi1> // CHECK: return %[[RESULT]] } @@ -1119,9 +1119,9 @@ func.func @is_finite(%arg0: tensor<3x4xf32>) -> tensor<3x4xi1> { func.func @is_finite_dynamic(%arg0: tensor) -> tensor { %0 = "tf.IsFinite"(%arg0) : (tensor) -> tensor func.return %0 : tensor - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor // CHECK: %[[SUB:.*]] = "tf.Sub"(%arg0, %arg0) : (tensor, tensor) -> tensor - // CHECK: %[[RESULT:.*]] = "tf.Equal"(%[[SUB]], %[[ZERO]]) {incompatible_shape_error = true} : (tensor, tensor) -> tensor + // CHECK: %[[RESULT:.*]] = "tf.Equal"(%[[SUB]], %[[ZERO]]) <{incompatible_shape_error = true}> : (tensor, tensor) -> tensor // CHECK: return %[[RESULT]] } @@ -1131,11 +1131,11 @@ func.func @roll_scalar_axis(%arg0: tensor<3x8x4xi32>) -> tensor<3x8x4xi32> { %0 = "tf.Roll"(%arg0, %shift, %axis) : (tensor<3x8x4xi32>, tensor, tensor) -> tensor<3x8x4xi32> func.return %0 : tensor<3x8x4xi32> // CHECK-LABEL: roll_scalar_axis - // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 6, 0]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST0:.*]] = "tf.Const"() {value = dense<[3, 2, 4]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST2:.*]] = "tf.Const"() {value = dense<[3, 6, 4]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST3:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 6, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST0:.*]] = "tf.Const"() <{value = dense<[3, 2, 4]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: 
%[[CST1:.*]] = "tf.Const"() <{value = dense<0> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST2:.*]] = "tf.Const"() <{value = dense<[3, 6, 4]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST3:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[SLICE:.*]] = "tf.Slice"(%arg0, %[[CST]], %[[CST0]]) : (tensor<3x8x4xi32>, tensor<3xi64>, tensor<3xi64>) -> tensor<3x2x4xi32> // CHECK: %[[SLICE1:.*]] = "tf.Slice"(%arg0, %[[CST1]], %[[CST2]]) : (tensor<3x8x4xi32>, tensor<3xi64>, tensor<3xi64>) -> tensor<3x6x4xi32> // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%[[SLICE]], %[[SLICE1]], %[[CST3]]) : (tensor<3x2x4xi32>, tensor<3x6x4xi32>, tensor) -> tensor<3x8x4xi32> @@ -1148,11 +1148,11 @@ func.func @roll_1d_axis(%arg0: tensor<3x8x4xi32>) -> tensor<3x8x4xi32> { %0 = "tf.Roll"(%arg0, %shift, %axis) : (tensor<3x8x4xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3x8x4xi32> func.return %0 : tensor<3x8x4xi32> // CHECK-LABEL: roll_1d_axis - // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 6, 0]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST0:.*]] = "tf.Const"() {value = dense<[3, 2, 4]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST2:.*]] = "tf.Const"() {value = dense<[3, 6, 4]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST3:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[0, 6, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST0:.*]] = "tf.Const"() <{value = dense<[3, 2, 4]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<0> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST2:.*]] = "tf.Const"() <{value = dense<[3, 6, 4]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST3:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[SLICE:.*]] = "tf.Slice"(%arg0, %[[CST]], %[[CST0]]) : (tensor<3x8x4xi32>, tensor<3xi64>, tensor<3xi64>) -> tensor<3x2x4xi32> // CHECK: %[[SLICE1:.*]] = "tf.Slice"(%arg0, %[[CST1]], %[[CST2]]) : (tensor<3x8x4xi32>, tensor<3xi64>, tensor<3xi64>) -> tensor<3x6x4xi32> // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%[[SLICE]], %[[SLICE1]], %[[CST3]]) : (tensor<3x2x4xi32>, tensor<3x6x4xi32>, tensor) -> tensor<3x8x4xi32> @@ -1165,15 +1165,15 @@ func.func @roll_multiple_axis(%arg0: tensor<3x8x4xi32>) -> tensor<3x8x4xi32> { %0 = "tf.Roll"(%arg0, %shift, %axis) : (tensor<3x8x4xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<3x8x4xi32> func.return %0 : tensor<3x8x4xi32> // CHECK-LABEL: roll_multiple_axis - // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 0, 0]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST0:.*]] = "tf.Const"() {value = dense<[2, 8, 4]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<[1, 8, 4]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-DAG: %[[CST3:.*]] = "tf.Const"() {value = dense<[0, 6, 0]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST4:.*]] = "tf.Const"() {value = dense<[3, 2, 4]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST5:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK-DAG: %[[CST6:.*]] = "tf.Const"() {value = dense<[3, 6, 4]> : tensor<3xi64>} : () 
-> tensor<3xi64> - // CHECK-DAG: %[[CST7:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 0, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST0:.*]] = "tf.Const"() <{value = dense<[2, 8, 4]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<[1, 8, 4]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK-DAG: %[[CST3:.*]] = "tf.Const"() <{value = dense<[0, 6, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST4:.*]] = "tf.Const"() <{value = dense<[3, 2, 4]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST5:.*]] = "tf.Const"() <{value = dense<0> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST6:.*]] = "tf.Const"() <{value = dense<[3, 6, 4]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK-DAG: %[[CST7:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[SLICE:.*]] = "tf.Slice"(%arg0, %[[CST]], %[[CST0]]) : (tensor<3x8x4xi32>, tensor<3xi64>, tensor<3xi64>) -> tensor<2x8x4xi32> // CHECK: %[[SLICE1:.*]] = "tf.Slice"(%arg0, %[[CST5]], %[[CST1]]) : (tensor<3x8x4xi32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x8x4xi32> // CHECK: %[[CONCAT:.*]] = "tf.ConcatV2"(%[[SLICE]], %[[SLICE1]], %[[CST2]]) : (tensor<2x8x4xi32>, tensor<1x8x4xi32>, tensor) -> tensor<3x8x4xi32> @@ -1184,8 +1184,8 @@ func.func @roll_multiple_axis(%arg0: tensor<3x8x4xi32>) -> tensor<3x8x4xi32> { } func.func @roll_dynamic_shape(%arg0: tensor) -> tensor { - %axis = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %shift = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %axis = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor + %shift = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor %0 = "tf.Roll"(%arg0, %shift, %axis) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor // CHECK-LABEL: roll_dynamic_shape @@ -1193,7 +1193,7 @@ func.func @roll_dynamic_shape(%arg0: tensor) -> tensor { } func.func @roll_non_constant_axis(%arg0: tensor<3x8x4xi32>, %arg1: tensor) -> tensor<3x8x4xi32> { - %shift = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %shift = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor %0 = "tf.Roll"(%arg0, %shift, %arg1) : (tensor<3x8x4xi32>, tensor, tensor) -> tensor<3x8x4xi32> func.return %0 : tensor<3x8x4xi32> // CHECK-LABEL: roll_non_constant_axis @@ -1201,7 +1201,7 @@ func.func @roll_non_constant_axis(%arg0: tensor<3x8x4xi32>, %arg1: tensor) } func.func @roll_non_constant_shift(%arg0: tensor<3x8x4xi32>, %arg1: tensor) -> tensor<3x8x4xi32> { - %axis = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %axis = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor %0 = "tf.Roll"(%arg0, %arg1, %axis) : (tensor<3x8x4xi32>, tensor, tensor) -> tensor<3x8x4xi32> func.return %0 : tensor<3x8x4xi32> // CHECK-LABEL: roll_non_constant_shift @@ -1213,9 +1213,9 @@ func.func @scatter_nd_updates(%arg0: tensor<14xf32>, %arg1: tensor<1x1xi32>, %ar func.return %0 : tensor<14xf32> // CHECK-LABEL: scatter_nd_updates - // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[CST0:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> - // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<14xf32>} : () -> tensor<14xf32> + // CHECK-DAG: %[[CST:.*]] = 
"tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[CST0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> + // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<14xf32>}> : () -> tensor<14xf32> // CHECK: %[[SCATTER:.*]] = "tf.TensorScatterAdd"(%cst_1, %arg1, %[[CST0]]) : (tensor<14xf32>, tensor<1x1xi32>, tensor<1xf32>) -> tensor<14xf32> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[CST]], %[[SCATTER]]) : (tensor, tensor<14xf32>) -> tensor<14xf32> // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[SUB]], %arg0) : (tensor<14xf32>, tensor<14xf32>) -> tensor<14xf32> @@ -1229,17 +1229,17 @@ func.func @scatter_nd_updates_bool(%arg0: tensor<1x24xi1>, %arg1: tensor<1x2x2xi func.return %0 : tensor<1x24xi1> // CHECK-LABEL: scatter_nd_updates_bool( -// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK-DAG: %[[CST0:.*]] = "tf.Const"() {value = dense<1> : tensor<1x2xi32>} : () -> tensor<1x2xi32> -// CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<0> : tensor<1x24xi32>} : () -> tensor<1x24xi32> -// CHECK: %[[CAST0:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x24xi1>) -> tensor<1x24xi32> -// CHECK: %[[CAST1:.*]] = "tf.Cast"(%arg2) {Truncate = false} : (tensor<1x2xi1>) -> tensor<1x2xi32> +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor +// CHECK-DAG: %[[CST0:.*]] = "tf.Const"() <{value = dense<1> : tensor<1x2xi32>}> : () -> tensor<1x2xi32> +// CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<0> : tensor<1x24xi32>}> : () -> tensor<1x24xi32> +// CHECK: %[[CAST0:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1x24xi1>) -> tensor<1x24xi32> +// CHECK: %[[CAST1:.*]] = "tf.Cast"(%arg2) <{Truncate = false}> : (tensor<1x2xi1>) -> tensor<1x2xi32> // CHECK: %[[SCATTER:.*]] = "tf.TensorScatterAdd"(%[[CST1]], %arg1, %[[CST0]]) : (tensor<1x24xi32>, tensor<1x2x2xi32>, tensor<1x2xi32>) -> tensor<1x24xi32> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[CST]], %[[SCATTER]]) : (tensor, tensor<1x24xi32>) -> tensor<1x24xi32> // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[SUB]], %[[CAST0]]) : (tensor<1x24xi32>, tensor<1x24xi32>) -> tensor<1x24xi32> // CHECK: %[[SCATTER1:.*]] = "tf.TensorScatterAdd"(%[[CST1]], %arg1, %[[CAST1]]) : (tensor<1x24xi32>, tensor<1x2x2xi32>, tensor<1x2xi32>) -> tensor<1x24xi32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[MUL]], %[[SCATTER1]]) : (tensor<1x24xi32>, tensor<1x24xi32>) -> tensor<1x24xi32> -// CHECK: %[[CAST2:.*]] = "tf.Cast"(%[[ADD]]) {Truncate = false} : (tensor<1x24xi32>) -> tensor<1x24xi1> +// CHECK: %[[CAST2:.*]] = "tf.Cast"(%[[ADD]]) <{Truncate = false}> : (tensor<1x24xi32>) -> tensor<1x24xi1> // CHECK: return %[[CAST2]] : tensor<1x24xi1> } @@ -1250,11 +1250,11 @@ func.func @scatter_nd_updates_bool(%arg0: tensor<1x24xi1>, %arg1: tensor<1x2x2xi // CHECK-LABEL: func @simple_softmax // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x3xf32>) func.func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { - // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi64>} : () -> tensor<1xi64> - // CHECK-DAG: %[[MAX:.*]] = "tf.Max"(%[[ARG0]], %[[AXIS]]) {keep_dims = true} : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> + // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi64>}> : () -> tensor<1xi64> + // CHECK-DAG: %[[MAX:.*]] = "tf.Max"(%[[ARG0]], %[[AXIS]]) <{keep_dims = true}> : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> // CHECK-DAG: %[[SHIFTED:.*]] = "tf.Sub"(%[[ARG0]], %[[MAX]]) : 
(tensor<2x3xf32>, tensor<2x1xf32>) -> tensor<2x3xf32> // CHECK-DAG: %[[EXP:.*]] = "tf.Exp"(%[[SHIFTED]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[SUM:.*]] = "tf.Sum"(%[[EXP]], %[[AXIS]]) {keep_dims = true} : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> + // CHECK-DAG: %[[SUM:.*]] = "tf.Sum"(%[[EXP]], %[[AXIS]]) <{keep_dims = true}> : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> // CHECK-DAG: %[[RESULT:.*]] = "tf.Div"(%[[EXP]], %[[SUM]]) : (tensor<2x3xf32>, tensor<2x1xf32>) -> tensor<2x3xf32> // CHECK: return %[[RESULT]] %0 = "tf.Softmax"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> @@ -1277,11 +1277,11 @@ func.func @unranked_softmax(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: func @simple_logsoftmax // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x3xf32>) func.func @simple_logsoftmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { - // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi64>} : () -> tensor<1xi64> - // CHECK-DAG: %[[MAX:.*]] = "tf.Max"(%[[ARG0]], %[[AXIS]]) {keep_dims = true} : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> + // CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi64>}> : () -> tensor<1xi64> + // CHECK-DAG: %[[MAX:.*]] = "tf.Max"(%[[ARG0]], %[[AXIS]]) <{keep_dims = true}> : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> // CHECK-DAG: %[[SHIFTED:.*]] = "tf.Sub"(%[[ARG0]], %[[MAX]]) : (tensor<2x3xf32>, tensor<2x1xf32>) -> tensor<2x3xf32> // CHECK-DAG: %[[EXP:.*]] = "tf.Exp"(%[[SHIFTED]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK-DAG: %[[SUM:.*]] = "tf.Sum"(%[[EXP]], %[[AXIS]]) {keep_dims = true} : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> + // CHECK-DAG: %[[SUM:.*]] = "tf.Sum"(%[[EXP]], %[[AXIS]]) <{keep_dims = true}> : (tensor<2x3xf32>, tensor<1xi64>) -> tensor<2x1xf32> // CHECK-DAG: %[[LOG:.*]] = "tf.Log"(%[[SUM]]) : (tensor<2x1xf32>) -> tensor<2x1xf32> // CHECK-DAG: %[[RESULT:.*]] = "tf.Sub"(%[[SHIFTED]], %[[LOG]]) : (tensor<2x3xf32>, tensor<2x1xf32>) -> tensor<2x3xf32> // CHECK: return %[[RESULT]] @@ -1299,10 +1299,10 @@ func.func @unranked_logsoftmax(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: func @selu // CHECK-SAME: (%[[FEATURES:.*]]: tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> { func.func @selu(%arg0: tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> { - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<1.05070102> : tensor} : () -> tensor - // CHECK-DAG: %[[SCALED_ALPHA:.*]] = "tf.Const"() {value = dense<1.75809932> : tensor} : () -> tensor - // CHECK-NEXT: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<1.05070102> : tensor}> : () -> tensor + // CHECK-DAG: %[[SCALED_ALPHA:.*]] = "tf.Const"() <{value = dense<1.75809932> : tensor}> : () -> tensor + // CHECK-NEXT: %[[ONE:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK-DAG: %[[PRED:.*]] = "tf.Greater"(%[[FEATURES]], %[[ZERO]]) : (tensor<1x4x4x3xf32>, tensor) -> tensor<1x4x4x3xi1> // CHECK-NEXT: %[[SCALED_FEATURES:.*]] = "tf.Mul"(%[[FEATURES]], %[[SCALE]]) : (tensor<1x4x4x3xf32>, tensor) -> tensor<1x4x4x3xf32> // CHECK-NEXT: %[[EXP:.*]] = "tf.Exp"(%[[FEATURES]]) : (tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> @@ -1317,9 +1317,9 @@ func.func @selu(%arg0: 
tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> { // CHECK-LABEL: func @selu_grad // CHECK-SAME: (%[[GRADIENTS:.*]]: tensor<4x8xf32>, %[[FEATURES:.*]]: tensor<4x8xf32>) -> tensor<4x8xf32> { func.func @selu_grad(%gradients: tensor<4x8xf32>, %features: tensor<4x8xf32>) -> tensor<4x8xf32> { - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor - // CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() {value = dense<1.05070102> : tensor} : () -> tensor - // CHECK-DAG: %[[SCALED_ALPHA:.*]] = "tf.Const"() {value = dense<1.75809932> : tensor} : () -> tensor + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + // CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<1.05070102> : tensor}> : () -> tensor + // CHECK-DAG: %[[SCALED_ALPHA:.*]] = "tf.Const"() <{value = dense<1.75809932> : tensor}> : () -> tensor // CHECK-DAG: %[[PRED:.*]] = "tf.Greater"(%[[FEATURES]], %[[ZERO]]) : (tensor<4x8xf32>, tensor) -> tensor<4x8xi1> // CHECK-NEXT: %[[SCALED_GRADIENTS:.*]] = "tf.Mul"(%[[GRADIENTS]], %[[SCALE]]) : (tensor<4x8xf32>, tensor) -> tensor<4x8xf32> // CHECK-NEXT: %[[FEATURES_PLUS_SCALED_ALPHA:.*]] = "tf.AddV2"(%[[FEATURES]], %[[SCALED_ALPHA]]) : (tensor<4x8xf32>, tensor) -> tensor<4x8xf32> @@ -1335,7 +1335,7 @@ func.func @selu_grad(%gradients: tensor<4x8xf32>, %features: tensor<4x8xf32>) -> func.func @expm1(%arg0: tensor<3x4xf32>) -> tensor<3x4xf32> { %0 = "tf.Expm1"(%arg0) : (tensor<3x4xf32>) -> tensor<3x4xf32> func.return %0 : tensor<3x4xf32> - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK: %[[EXP:.*]] = "tf.Exp"(%[[ARG0]]) : (tensor<3x4xf32>) -> tensor<3x4xf32> // CHECK: %[[RESULT:.*]] = "tf.Sub"(%[[EXP]], %[[ONE]]) : (tensor<3x4xf32>, tensor) -> tensor<3x4xf32> // CHECK: return %[[RESULT]] @@ -1344,11 +1344,11 @@ func.func @expm1(%arg0: tensor<3x4xf32>) -> tensor<3x4xf32> { // CHECK-LABEL: func @matrix_band_part // CHECK-SAME: (%[[INPUT:.*]]: tensor<4x5xf32>, %[[NUM_LOWER:.*]]: tensor, %[[NUM_UPPER:.*]]: tensor) -> tensor<4x5xf32> { func.func @matrix_band_part(%input: tensor<4x5xf32>, %num_lower: tensor, %num_upper: tensor) -> tensor<4x5xf32> { - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-DAG: %[[OFFSET:.*]] = "tf.Const"() {{.+}} : () -> tensor<4x5xi64> - // CHECK-DAG: %[[M:.*]] = "tf.Const"() {value = dense<4> : tensor} : () -> tensor - // CHECK-DAG: %[[N:.*]] = "tf.Const"() {value = dense<5> : tensor} : () -> tensor - // CHECK-DAG: %[[ZEROS_LIKE:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<4x5xf32>} : () -> tensor<4x5xf32> + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK-DAG: %[[OFFSET:.*]] = "tf.Const"() <{{.+}}> : () -> tensor<4x5xi64> + // CHECK-DAG: %[[M:.*]] = "tf.Const"() <{value = dense<4> : tensor}> : () -> tensor + // CHECK-DAG: %[[N:.*]] = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor + // CHECK-DAG: %[[ZEROS_LIKE:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<4x5xf32>}> : () -> tensor<4x5xf32> // CHECK-DAG: %[[LE:.*]] = "tf.Less"(%[[NUM_LOWER]], %[[ZERO]]) : (tensor, tensor) -> tensor // CHECK-DAG: %[[NUM_LOWER_OR_M:.*]] = "tf.SelectV2"(%[[LE]], %[[M]], %[[NUM_LOWER]]) : (tensor, tensor, tensor) -> tensor // CHECK-DAG: %[[LE1:.*]] = "tf.Less"(%[[NUM_UPPER]], %[[ZERO]]) : (tensor, tensor) -> tensor @@ -1373,17 +1373,17 @@ 
func.func @rank3_matrix_band_part(%input: tensor, %num_lower: tensor< // CHECK-LABEL: func @dynamic_shape_matrix_band_part // CHECK-SAME: (%[[INPUT:.*]]: tensor, %[[NUM_LOWER:.*]]: tensor, %[[NUM_UPPER:.*]]: tensor) -> tensor { func.func @dynamic_shape_matrix_band_part(%input: tensor, %num_lower: tensor, %num_upper: tensor) -> tensor { - // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // CHECK-DAG: %[[NEG_ONE:.*]] = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor - // CHECK-DAG: %[[ZERO_1D:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-DAG: %[[ONE_1D:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-DAG: %[[TWO_1D:.*]] = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-DAG: %[[ZERO_F32:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor + // CHECK-DAG: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK-DAG: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor + // CHECK-DAG: %[[NEG_ONE:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor + // CHECK-DAG: %[[ZERO_1D:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK-DAG: %[[ONE_1D:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK-DAG: %[[TWO_1D:.*]] = "tf.Const"() <{value = dense<2> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK-DAG: %[[ZERO_F32:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor // CHECK-DAG: %[[SHAPE:.*]] = "tf.Shape"(%[[INPUT]]) : (tensor) -> tensor<2xi32> - // CHECK-DAG: %[[M:.*]] = "tf.StridedSlice"(%[[SHAPE]], %[[ZERO_1D]], %[[ONE_1D]], %[[ONE_1D]]) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + // CHECK-DAG: %[[M:.*]] = "tf.StridedSlice"(%[[SHAPE]], %[[ZERO_1D]], %[[ONE_1D]], %[[ONE_1D]]) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK-DAG: %[[SHAPE1:.*]] = "tf.Shape"(%[[INPUT]]) : (tensor) -> tensor<2xi32> - // CHECK-DAG: %[[N:.*]] = "tf.StridedSlice"(%[[SHAPE1]], %[[ONE_1D]], %[[TWO_1D]], %[[ONE_1D]]) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + // CHECK-DAG: %[[N:.*]] = "tf.StridedSlice"(%[[SHAPE1]], %[[ONE_1D]], %[[TWO_1D]], %[[ONE_1D]]) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK-DAG: %[[LE:.*]] = "tf.Less"(%[[NUM_LOWER]], %[[ZERO]]) : (tensor, tensor) -> tensor // CHECK-DAG: %[[NUM_LOWER_OR_M:.*]] = "tf.SelectV2"(%[[LE]], %[[M]], %[[NUM_LOWER]]) : (tensor, tensor, tensor) -> tensor // CHECK-DAG: %[[LE1:.*]] = "tf.Less"(%[[NUM_UPPER]], %[[ZERO]]) : (tensor, tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir index 
837c37b9a71be8..ca1e4c99549d94 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir @@ -131,7 +131,7 @@ func.func @ignore_const_foldable_ops(%arg0: tensor) -> () { // CHECK-LABEL: func @op_string_result func.func @op_string_result() -> tensor { %0 = "tf_device.cluster"() ({ - // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK: "tf.Const"() <{value = dense<1> : tensor}> // CHECK-NOT: _xla_outside_compilation // CHECK: "tf.Const" // CHECK-SAME: _xla_outside_compilation @@ -148,7 +148,7 @@ func.func @op_string_result() -> tensor { // CHECK-LABEL: func @op_string_operand func.func @op_string_operand(%arg0: tensor) -> tensor { %0 = "tf_device.cluster"() ({ - // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK: "tf.Const"() <{value = dense<1> : tensor}> // CHECK-NOT: _xla_outside_compilation // CHECK: "tf.StringToNumber" // CHECK-SAME: _xla_outside_compilation @@ -166,7 +166,7 @@ func.func @op_string_operand(%arg0: tensor) -> tensor { // CHECK-LABEL: func @op_string_operand_string_result func.func @op_string_operand_string_result(%arg0: tensor) -> tensor { %0 = "tf_device.cluster"() ({ - // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK: "tf.Const"() <{value = dense<1> : tensor}> // CHECK-NOT: _xla_outside_compilation // CHECK: "tf.Identity" // CHECK-SAME: _xla_outside_compilation @@ -187,7 +187,7 @@ func.func @op_string_operand_string_result(%arg0: tensor) -> te // CHECK-LABEL: func @ops_inside_tf_if_outside_compiled func.func @ops_inside_tf_if_outside_compiled(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "tf_device.cluster"() ({ - // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK: "tf.Const"() <{value = dense<1> : tensor}> // CHECK-NOT: _xla_outside_compilation // CHECK: "tf.IfRegion" // CHECK: "tf.StringToNumber" @@ -212,22 +212,22 @@ func.func @ops_inside_tf_if_outside_compiled(%arg0: tensor, %arg1: tensor, %arg1: tensor) -> tensor { %0 = "tf_device.cluster"() ({ - // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK: "tf.Const"() <{value = dense<1> : tensor}> // CHECK-NOT: _xla_outside_compilation // CHECK: "tf.IfRegion" + // CHECK: <{is_stateless // CHECK-NOT: _xla_outside_compilation %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %2 = "tf.IfRegion"(%arg0) ({ %3 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor "tf.Yield"(%3) : (tensor) -> () }, { - // CHECK: "tf.Const"() {_xla_outside_compilation = "auto0", value = dense<"1.0"> : tensor} + // CHECK: "tf.Const"() <{value = dense<"1.0"> : tensor}> {_xla_outside_compilation = "auto0"} // CHECK-NEXT: "tf.StringToNumber" // CHECK-SAME: _xla_outside_compilation %4 = "tf.Const"() {value = dense<"1.0"> : tensor} : () -> tensor %5 = "tf.StringToNumber"(%4) {out_type = f32} : (tensor) -> tensor "tf.Yield"(%5) : (tensor) -> () - // CHECK: {is_stateless }) {is_stateless = true} : (tensor) -> (tensor) %6 = "tf.Identity"(%2) : (tensor) -> tensor tf_device.return %6: tensor @@ -241,34 +241,34 @@ func.func @if_region_string_op(%arg0: tensor, %arg1: tensor) -> tenso // CHECK-LABEL: func @nested_if_region_string_op func.func @nested_if_region_string_op(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "tf_device.cluster"() ({ - // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK: "tf.Const"() <{value = dense<1> : tensor}> // CHECK-NOT: _xla_outside_compilation // CHECK: "tf.IfRegion" + // CHECK: <{is_stateless // CHECK-NOT: 
_xla_outside_compilation %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %2 = "tf.IfRegion"(%arg0) ({ %3 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor "tf.Yield"(%3) : (tensor) -> () }, { - // CHECK: "tf.Const"() {value = dense : tensor} + // CHECK: "tf.Const"() <{value = dense : tensor}> + // CHECK: <{is_stateless // CHECK-NOT: _xla_outside_compilation %4 = "tf.Const"() {value = dense : tensor} : () -> tensor %5 = "tf.IfRegion"(%4)({ - // CHECK: "tf.Const"() {_xla_outside_compilation = "auto0", value = dense<"1.0"> : tensor} + // CHECK: "tf.Const"() <{value = dense<"1.0"> : tensor}> {_xla_outside_compilation = "auto0"} // CHECK-NEXT: "tf.StringToNumber" // CHECK-SAME: _xla_outside_compilation %6 = "tf.Const"() {value = dense<"1.0"> : tensor} : () -> tensor %7 = "tf.StringToNumber"(%6) {out_type = f32} : (tensor) -> tensor "tf.Yield"(%7) : (tensor) -> () }, { - // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK: "tf.Const"() <{value = dense<1.000000e+00> : tensor}> // CHECK-NOT: _xla_outside_compilation %8 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor "tf.Yield"(%8) : (tensor) -> () - // CHECK: {is_stateless }){is_stateless = true} : (tensor) -> (tensor) "tf.Yield"(%5) : (tensor) -> () - // CHECK: {is_stateless }) {is_stateless = true} : (tensor) -> (tensor) %9 = "tf.Identity"(%2) : (tensor) -> tensor tf_device.return %9: tensor @@ -282,7 +282,7 @@ func.func @nested_if_region_string_op(%arg0: tensor, %arg1: tensor) - // CHECK-LABEL: func @ops_inside_while_outside_compiled func.func @ops_inside_while_outside_compiled(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "tf_device.cluster"() ({ - // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK: "tf.Const"() <{value = dense<1.000000e+00> : tensor}> // CHECK-NOT: _xla_outside_compilation // CHECK: "tf.WhileRegion" // CHECK: "tf.StringToNumber" @@ -313,9 +313,10 @@ func.func @ops_inside_while_outside_compiled(%arg0: tensor, %arg1: tensor, %arg1: tensor) -> tensor { %0 = "tf_device.cluster"() ({ - // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK: "tf.Const"() <{value = dense<1.000000e+00> : tensor}> // CHECK-NOT: _xla_outside_compilation // CHECK: "tf.WhileRegion" + // CHECK: <{is_stateless = true %1 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor %2:2 = "tf.WhileRegion"(%1, %arg0) ({ ^bb0(%carg0: tensor, %carg1: tensor): @@ -329,10 +330,9 @@ func.func @while_region_unsupported_op(%arg0: tensor, %arg1: tensor : tensor} : () -> tensor - // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK: "tf.Const"() <{value = dense<1.000000e+00> : tensor}> %4 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor "tf.Yield"(%4, %sub) : (tensor, tensor) -> () - // CHECK: {is_stateless = true }) {is_stateless = true} : (tensor, tensor) -> (tensor, tensor) // CHECK: "tf.Identity" // CHECK-NOT: _xla_outside_compilation diff --git a/tensorflow/compiler/mlir/tensorflow/tests/merge_control_flow.mlir b/tensorflow/compiler/mlir/tensorflow/tests/merge_control_flow.mlir index bd5f805b810f69..a9e4fc902401b6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/merge_control_flow.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/merge_control_flow.mlir @@ -807,19 +807,19 @@ func.func @nested_IfRegions_with_same_predicate_same_block_level_merged() { func.func @two_overlapped_if_groups_with_no_dependency_merged() { // CHECK: tf_device.cluster // CHECK: "tf.IfRegion" - // CHECK: "tf.Const"() {value = dense<1.000000e+00> : 
tensor} : () -> tensor - // CHECK "tf.Const"() {value = dense<5.000000e+00> : tensor} : () -> tensor - // CHECK "tf.Const"() {value = dense<9.000000e+00> : tensor} : () -> tensor - // CHECK: "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor - // CHECK "tf.Const"() {value = dense<6.000000e+00> : tensor} : () -> tensor - // CHECK "tf.Const"() {value = dense<1.000000e+01> : tensor} : () -> tensor + // CHECK: "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor + // CHECK "tf.Const"() <{value = dense<5.000000e+00> : tensor}> : () -> tensor + // CHECK "tf.Const"() <{value = dense<9.000000e+00> : tensor}> : () -> tensor + // CHECK: "tf.Const"() <{value = dense<2.000000e+00> : tensor}> : () -> tensor + // CHECK "tf.Const"() <{value = dense<6.000000e+00> : tensor}> : () -> tensor + // CHECK "tf.Const"() <{value = dense<1.000000e+01> : tensor}> : () -> tensor // CHECK: "tf.IfRegion" - // CHECK: "tf.Const"() {value = dense<3.000000e+00> : tensor} : () -> tensor - // CHECK "tf.Const"() {value = dense<7.000000e+00> : tensor} : () -> tensor - // CHECK "tf.Const"() {value = dense<1.100000e+01> : tensor} : () -> tensor - // CHECK: "tf.Const"() {value = dense<4.000000e+00> : tensor} : () -> tensor - // CHECK "tf.Const"() {value = dense<8.000000e+00> : tensor} : () -> tensor - // CHECK "tf.Const"() {value = dense<1.200000e+01> : tensor} : () -> tensor + // CHECK: "tf.Const"() <{value = dense<3.000000e+00> : tensor}> : () -> tensor + // CHECK "tf.Const"() <{value = dense<7.000000e+00> : tensor}> : () -> tensor + // CHECK "tf.Const"() <{value = dense<1.100000e+01> : tensor}> : () -> tensor + // CHECK: "tf.Const"() <{value = dense<4.000000e+00> : tensor}> : () -> tensor + // CHECK "tf.Const"() <{value = dense<8.000000e+00> : tensor}> : () -> tensor + // CHECK "tf.Const"() <{value = dense<1.200000e+01> : tensor}> : () -> tensor // CHECK-NOT: "tf.IfRegion" "tf_device.cluster"() ({ %0 = "tf.Const"() {value = dense : tensor} : () -> tensor @@ -888,12 +888,12 @@ func.func @two_overlapped_if_groups_with_no_dependency_merged() { // CHECK: "tf.E" // CHECK: "tf.F" // CHECK: "tf.IfRegion" - // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor - // CHECK: "tf.Const"() {value = dense<3.000000e+00> : tensor} : () -> tensor - // CHECK: "tf.Const"() {value = dense<5.000000e+00> : tensor} : () -> tensor - // CHECK: "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor - // CHECK: "tf.Const"() {value = dense<4.000000e+00> : tensor} : () -> tensor - // CHECK; "tf.Const"() {value = dense<6.000000e+00> : tensor} : () -> tensor + // CHECK: "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor + // CHECK: "tf.Const"() <{value = dense<3.000000e+00> : tensor}> : () -> tensor + // CHECK: "tf.Const"() <{value = dense<5.000000e+00> : tensor}> : () -> tensor + // CHECK: "tf.Const"() <{value = dense<2.000000e+00> : tensor}> : () -> tensor + // CHECK: "tf.Const"() <{value = dense<4.000000e+00> : tensor}> : () -> tensor + // CHECK; "tf.Const"() <{value = dense<6.000000e+00> : tensor}> : () -> tensor // CHECK-NOT: "tf.IfRegion" func.func @two_overlapped_if_groups_with_dependency_not_merged_for_first_if_region_group() { "tf_device.cluster"() ({ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlprogram.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlprogram.mlir index 5b0958aa37696b..6ded59b51ad8d4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlprogram.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlprogram.mlir @@ -123,7 +123,7 @@ 
module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK-LABEL @lowers_string_ops // CHECK-DAG: ml_program.global public @vars.Variable_1([]) : tensor func.func @lowers_string_ops(%arg0: tensor<128xi32>, %arg1: tensor<128xi32>, %arg2: tensor<128x1xi32>, %arg3: tensor<128x90xi32>, %arg4: tensor<128x90xi32>, %arg5: tensor<128x90xi32>, %arg6: tensor<128x90x64xf32>, %arg7: tensor<128x90x64xf32>) -> tensor { - // CHECK: %0 = ml_program.global_load @vars.Variable_1 : tensor + // CHECK: %[[v0:.*]] = ml_program.global_load @vars.Variable_1 : tensor %0 = tf_executor.graph { %outputs_4, %control_5 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "Variable"} : () -> tensor>> %outputs_10, %control_11 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "Variable_1"} : () -> tensor>> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir b/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir index 45ee57ad75d72a..021cad3b78be8f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir @@ -3,8 +3,8 @@ // CHECK-LABEL: @ShardingAttr func.func @ShardingAttr(%arg0: tensor<128x10xf32> {mhlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"}, %arg1: tensor<10x1024xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg2: tensor<128x1024xf32> {mhlo.sharding = ""}) -> (tensor<128x10xf32>, tensor<10x1024xf32>, tensor<128x1024xf32>) { - // CHECK: %[[SHARDED_ARG0:.*]] = "tf.XlaSharding"(%arg0) {_XlaSharding = "\08\03\1A\02\01\02\22\02\00\01", sharding = "\08\03\1A\02\01\02\22\02\00\01"} - // CHECK: %[[SHARDED_ARG1:.*]] = "tf.XlaSharding"(%arg1) {_XlaSharding = "\08\01\1A\01\01\22\01\00", sharding = "\08\01\1A\01\01\22\01\00"} + // CHECK: %[[SHARDED_ARG0:.*]] = "tf.XlaSharding"(%arg0) <{_XlaSharding = "\08\03\1A\02\01\02\22\02\00\01", sharding = "\08\03\1A\02\01\02\22\02\00\01"}> + // CHECK: %[[SHARDED_ARG1:.*]] = "tf.XlaSharding"(%arg1) <{_XlaSharding = "\08\01\1A\01\01\22\01\00", sharding = "\08\01\1A\01\01\22\01\00"}> // CHECK: "tf.Identity"(%[[SHARDED_ARG1]]) %0 = "tf.Identity"(%arg1) : (tensor<10x1024xf32>) -> tensor<10x1024xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index 14f20b6cf5a18a..faf2a960aed7d8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -62,7 +62,7 @@ func.func @main(%arg0: tensor) { func.func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<4.200000e+01> : tensor}> // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST]]) // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg1) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) @@ -133,7 +133,7 @@ func.func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-LABEL: func @main(%arg0: tensor) -> (tensor<2xf32>, tensor {tf.resource_name = "x"}) func.func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.AssignVariableOp" - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} + // CHECK: 
%[[CONST:.*]] = "tf.Const"() <{value = dense<4.200000e+01> : tensor}> // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%[[CONST]], %[[CONST]]) // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %[[ADD1]]) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) @@ -222,7 +222,7 @@ func.func @main(%arg0: tensor>>, %arg1: tensor func.func @main(%arg0: tensor>>, %arg1: tensor) { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.050000e+03> : tensor}> %1 = "tf.Const"() {value = dense<1.050000e+03> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %1) : (tensor>>, tensor) -> () // CHECK-NEXT: return %[[CONST]] : tensor @@ -241,7 +241,7 @@ func.func @main(%arg0: tensor>>, %arg1: tensor func.func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.050000e+03> : tensor}> %1 = "tf.Const"() {value = dense<1.050000e+03> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %1) : (tensor>>, tensor) -> () // CHECK-NEXT: return %[[CONST]], %[[CONST]] : tensor, tensor @@ -257,13 +257,13 @@ func.func @main(%arg0: tensor>>, %arg1: tensor // CHECK-SAME: %arg1: tensor // CHECK-SAME: -> (tensor, tensor) func.func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { - // CHECK-NEXT: %[[CONST_0:.*]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} + // CHECK-NEXT: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.200000e+01> : tensor}> %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () %1 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor // CHECK-NEXT: %[[ADD:[a-z0-9]+]] = "tf.AddV2"(%[[CONST_0]], %[[CONST_0]]) %2 = "tf.AddV2"(%1, %1) : (tensor, tensor) -> tensor - // CHECK-NEXT: %[[CONST_1:.*]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} + // CHECK-NEXT: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<1.050000e+03> : tensor}> %3 = "tf.Const"() {value = dense<1.050000e+03> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %3) : (tensor>>, tensor) -> () // CHECK-NEXT: return %[[ADD]], %[[CONST_1]] : tensor, tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir index e26a299a85c66d..eff3e38ab5ace2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir @@ -7,11 +7,11 @@ // CHECK-NEXT: "tf.Abs" func.func @testSimple(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { // CHECK: "tf.If" - // CHECK-SAME: _attr0 = false - // CHECK-SAME: _xla_propagate_compile_time_consts = true // CHECK-NOT: attr1 // CHECK-SAME: else_branch = @test_else_name // CHECK-SAME: then_branch = @test_then_name + // CHECK-SAME: _attr0 = false + // CHECK-SAME: _xla_propagate_compile_time_consts = true %0 = "tf.IfRegion"(%arg0) ({ %1 = "tf.Abs"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> "tf.Yield"(%1) : (tensor<*xf32>) -> () @@ -31,10 +31,10 @@ func.func @testSimple(%arg0: tensor, %arg1: tensor<*xf32>) -> 
tensor<*xf32> // CHECK-NEXT: "tf.Abs" func.func @testSimpleEmptyBranchNames(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { // CHECK: "tf.If" - // CHECK-SAME: _attr0 = false // CHECK-NOT: attr1 // CHECK-SAME: else_branch = @tf.IfRegion_else // CHECK-SAME: then_branch = @tf.IfRegion_then + // CHECK-SAME: _attr0 = false %0 = "tf.IfRegion"(%arg0) ({ %1 = "tf.Abs"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> "tf.Yield"(%1) : (tensor<*xf32>) -> () @@ -78,7 +78,7 @@ func.func @testIfCondition(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2x // CHECK-NEXT: constant dense<0.0 func.func @testIfConstant(%arg0: tensor) -> tensor<2xf32> { %cst_zero = arith.constant dense<0.0> : tensor<2xf32> - // CHECK: "tf.If"(%arg0) {{.*}} else_branch = @tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then + // CHECK: "tf.If"(%arg0) <{else_branch = @tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then %0 = "tf.IfRegion"(%arg0) ({ "tf.Yield"(%cst_zero) : (tensor<2xf32>) -> () }, { @@ -98,7 +98,7 @@ func.func @testIfConstant(%arg0: tensor) -> tensor<2xf32> { // CHECK: func private @tf.IfRegion1_then // CHECK-NEXT: "tf.LogicalNot" // CHECK-NEXT: "tf.Asin" -// CHECK-NEXT: "tf.If"({{.+}}) {{.*}} else_branch = @tf.IfRegion_else, {{.+}} then_branch = @tf.IfRegion_then} +// CHECK-NEXT: "tf.If"({{.+}}) <{else_branch = @tf.IfRegion_else, {{.+}} then_branch = @tf.IfRegion_then} // CHECK: func private @tf.IfRegion_else // CHECK-NEXT: "tf.Neg" @@ -106,7 +106,7 @@ func.func @testIfConstant(%arg0: tensor) -> tensor<2xf32> { // CHECK-NEXT: "tf.Abs" func.func @testNested(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: "tf.If"({{.+}}) {{.*}} else_branch = @tf.IfRegion1_else, {{.+}} then_branch = @tf.IfRegion1_then} + // CHECK: "tf.If"({{.+}}) <{else_branch = @tf.IfRegion1_else, {{.+}} then_branch = @tf.IfRegion1_then} %0 = "tf.IfRegion"(%arg0) ({ // Outer Then %cond = "tf.LogicalNot"(%arg0) : (tensor) -> tensor @@ -137,7 +137,7 @@ func.func @testNested(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> func.func private @testIf1Then(tensor<*xf32>) -> tensor<*xf32> func.func private @testIf1Else(tensor<*xf32>) -> tensor<*xf32> func.func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: "tf.If"({{.+}}) {{.*}} else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} + // CHECK: "tf.If"({{.+}}) <{else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} %0 = "tf.IfRegion"(%arg0) ({ %1 = func.call @testIf1Then(%arg1) : (tensor<*xf32>) -> tensor<*xf32> "tf.Yield"(%1) : (tensor<*xf32>) -> () @@ -155,7 +155,7 @@ func.func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf3 func.func private @testIf1Then(tensor<*xf32>) -> tensor<*xf32> func.func private @testIf1Else(tensor<*xf32>) -> tensor<*xf32> func.func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // CHECK: "tf.If"({{.+}}) {{.*}} else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} + // CHECK: "tf.If"({{.+}}) <{else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} %0 = "tf.IfRegion"(%arg0) ({ %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xf32>) -> tensor<*xf32> %2 = func.call @testIf1Then(%1) : (tensor<*xf32>) -> tensor<*xf32> @@ -175,7 +175,7 @@ func.func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf3 func.func private @testIf1Then(tensor<*xf32>) -> tensor<*xf32> func.func private @testIf1Else(tensor<*xf32>) -> tensor<*xf32> func.func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // CHECK: 
"tf.If"({{.+}}) {{.*}} else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} + // CHECK: "tf.If"({{.+}}) <{else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} %0 = "tf.IfRegion"(%arg0) ({ %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xf32>) -> tensor %2 = "tf.Cast"(%1) {Truncate = false} : (tensor) -> tensor<*xf32> @@ -197,8 +197,8 @@ func.func @testIf2Result(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf3 func.func private @testIf1Then(tensor<*xf32>) -> tensor<*xf32> func.func private @testIf1Else(tensor<*xf32>) -> tensor<*xf32> func.func @testIfExternIncompatibleCastTrivialTransform(%arg0: tensor, %arg1: tensor<2xi64>) -> tensor<2xf32> { - // CHECK: %[[CAST:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xi64>) -> tensor<*xf32> - // CHECK: "tf.If"(%arg0, %[[CAST]]) {{.*}} else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} + // CHECK: %[[CAST:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor<2xi64>) -> tensor<*xf32> + // CHECK: "tf.If"(%arg0, %[[CAST]]) <{else_branch = @testIf1Else, {{.+}} then_branch = @testIf1Then} %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xi64>) -> tensor<*xf32> %0 = "tf.IfRegion"(%arg0) ({ %2 = func.call @testIf1Then(%1) : (tensor<*xf32>) -> tensor<*xf32> @@ -221,7 +221,7 @@ func.func @testIfExternIncompatibleCastTrivialTransform(%arg0: tensor, %arg1 func.func private @testIf1Then(tensor<*xf32>) -> tensor<*xf32> func.func private @testIf1Else(tensor<*xf32>) -> tensor<*xf32> func.func @testIfIncompatibleCastTrivialTransform(%arg0: tensor, %arg1: tensor<2xi64>) -> tensor<2xf32> { - // CHECK: "tf.If"(%arg0, %arg1) {{.*}} else_branch = @tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then} + // CHECK: "tf.If"(%arg0, %arg1) <{else_branch = @tf.IfRegion_else{{.+}}then_branch = @tf.IfRegion_then} %0 = "tf.IfRegion"(%arg0) ({ %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2xi64>) -> tensor<*xf32> %2 = func.call @testIf1Then(%1) : (tensor<*xf32>) -> tensor<*xf32> @@ -341,11 +341,11 @@ func.func @testCase(%arg0: tensor, %arg1: tensor, %arg1 : tensor) -> tensor<*xf32> { // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) - // CHECK-SAME: _attr0 = false - // CHECK-SAME: _xla_propagate_compile_time_consts = true // CHECK-NOT: attr1 // CHECK-SAME: body = @tf.WhileRegion_body // CHECK-SAME: cond = @tf.WhileRegion_cond + // CHECK-SAME: _attr0 = false + // CHECK-SAME: _xla_propagate_compile_time_consts = true %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { // condition, check if count has reached 0 @@ -379,7 +379,7 @@ func.func @testValidWhileRegion(%arg0 : tensor<*xf32>, %arg1 : tensor) -> t // CHECK: "tf.NotEqual" // CHECK-LABEL: testWhileRegionTypeMismatch func.func @testWhileRegionTypeMismatch(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { - // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {{.*}} body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) <{body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { // condition, check if count has reached 0 @@ -415,7 +415,7 @@ func.func @testWhileRegionTypeMismatch(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg1 : tensor) -> tensor<*xf32> { %zero = arith.constant dense<0> : tensor %one = arith.constant dense<1> : tensor - // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {{.*}} body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) <{body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond %0:2 = 
"tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<4xf32>, %carg1: tensor): @@ -448,7 +448,7 @@ func.func @testWhileRegionConstantSink(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor<*xf32> { %cst = arith.constant dense<4> : tensor %limit = "tf.Add"(%arg2, %cst) : (tensor, tensor) -> tensor - // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}} body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}} <{body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): @@ -485,7 +485,7 @@ func.func @testWhileRegionExternInBody(%arg0 : tensor<*xf32>, %arg1 : tensor : tensor %cst = arith.constant dense<4> : tensor %stride = "tf.Add"(%arg2, %cst) : (tensor, tensor) -> tensor - // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}} body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}} <{body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): @@ -516,7 +516,7 @@ func.func @testWhileRegionExternInBodyAndCond(%arg0 : tensor<*xf32>, %arg1 : ten %stride = "tf.Add"(%arg2, %cst) : (tensor, tensor) -> tensor %cst1 = arith.constant dense<44> : tensor %limit = "tf.Add"(%arg2, %cst1) : (tensor, tensor) -> tensor - // CHECK: [[Result:%.*]]:4 = "tf.While"(%arg0, %arg1, %{{.+}}, %{{.+}} body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + // CHECK: [[Result:%.*]]:4 = "tf.While"(%arg0, %arg1, %{{.+}}, %{{.+}} <{body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): @@ -545,7 +545,7 @@ func.func @testWhileRegionExternInBodyAndCond(%arg0 : tensor<*xf32>, %arg1 : ten func.func @testWhileRegionSameExternInBodyAndCond(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg2 : tensor) -> tensor<*xf32> { %cst = arith.constant dense<4> : tensor %stride = "tf.Add"(%arg2, %cst) : (tensor, tensor) -> tensor - // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}} body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}} <{body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): @@ -573,7 +573,7 @@ func.func @testWhileRegionSameExternInBodyAndCond(%arg0 : tensor<*xf32>, %arg1 : func.func private @while_cond(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor func.func private @while_body(%arg0 : tensor<*xf32>, %arg1 : tensor) -> (tensor<*xf32>, tensor) func.func @testWhileRegionTrivial(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { - // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {{.*}} body = @while_body, cond = @while_cond + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) <{body = @while_body, cond = @while_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): @@ -600,7 +600,7 @@ func.func @testWhileRegionTrivial(%arg0 : tensor<*xf32>, %arg1 : tensor) -> func.func private @while_cond(%arg0 : tensor<4xf32>, %arg1 : tensor) -> tensor func.func private @while_body(%arg0 : tensor<4xf32>, %arg1 : tensor) -> (tensor<4xf32>, tensor) func.func @testWhileRegionTrivialCasts(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { - // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {{.*}} 
body = @while_body, cond = @while_cond + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) <{body = @while_body, cond = @while_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): @@ -629,7 +629,7 @@ func.func @testWhileRegionTrivialCasts(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg1 : tensor) -> tensor func.func private @while_body(%arg0 : tensor<4xf32>, %arg1 : tensor) -> (tensor<4xf32>, tensor) func.func @testWhileRegionTrivialMultipleCasts(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { - // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {{.*}} body = @while_body, cond = @while_cond + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) <{body = @while_body, cond = @while_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): @@ -662,7 +662,7 @@ func.func @testWhileRegionTrivialMultipleCasts(%arg0 : tensor<*xf32>, %arg1 : te func.func private @while_cond(%arg0 : tensor<4xf32>, %arg1 : tensor) -> tensor func.func private @while_body(%arg0 : tensor<4xf32>, %arg1 : tensor) -> (tensor<4xi64>, tensor) func.func @testWhileRegionIncompatibleCast(%arg0 : tensor<*xi64>, %arg1 : tensor) -> tensor<*xi64> { - // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {{.*}} body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) <{body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xi64>, %carg1: tensor): @@ -694,7 +694,7 @@ func.func private @while_cond(%arg0 : tensor<*xf32>, %arg1 : tensor) -> ten func.func private @while_body(%arg0 : tensor<*xf32>, %arg1 : tensor, %arg2 : tensor<*xf32>) -> (tensor<*xf32>, tensor) func.func @testWhileRegionExtern(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { %ext = "tf.Neg"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> - // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}} body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + // CHECK: [[Result:%.*]]:3 = "tf.While"(%arg0, %arg1, %{{.+}} <{body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): @@ -723,7 +723,7 @@ func.func @testWhileRegionExtern(%arg0 : tensor<*xf32>, %arg1 : tensor) -> func.func private @while_cond(%arg0 : tensor, %arg1 : tensor<*xf32>) -> tensor func.func private @while_body(%arg0 : tensor<*xf32>, %arg1 : tensor) -> (tensor<*xf32>, tensor) func.func @testWhileRegionBlockArgMismatch(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { - // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {{.*}} body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) <{body = @tf.WhileRegion_body, cond = @tf.WhileRegion_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): @@ -750,7 +750,7 @@ func.func @testWhileRegionBlockArgMismatch(%arg0 : tensor<*xf32>, %arg1 : tensor func.func private @while_cond(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor func.func private @while_body(%arg0 : tensor<*xf32>, %arg1 : tensor) -> (tensor<*xf32>, tensor) func.func @testWhileRegionTrivial(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { - // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {{.*}} body = @while_body, cond = @while_cond + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) <{body = @while_body, cond = @while_cond %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { ^bb0(%carg0: 
tensor<*xf32>, %carg1: tensor): @@ -830,11 +830,11 @@ func.func @testOverrideIfRegionXlaPropageCompileTimeConsts(%arg0: tensor, %a // CHECK-LABEL: testValidWhileRegion func.func @testValidWhileRegion(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) - // CHECK-SAME: _attr0 = false - // CHECK-SAME: _xla_propagate_compile_time_consts = true // CHECK-NOT: attr1 // CHECK-SAME: body = @tf.WhileRegion_body // CHECK-SAME: cond = @tf.WhileRegion_cond + // CHECK-SAME: _attr0 = false + // CHECK-SAME: _xla_propagate_compile_time_consts = true %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( { // condition, check if count has reached 0 @@ -881,3 +881,82 @@ func.func @testPassThroughCond(%arg0 : tensor<*xf32>, %arg1 : tensor) -> te ) { is_stateless = false, _attr0 = false, attr1 = "hello"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) func.return %0#0 : tensor<*xf32> } + +// ----- + +func.func @init(%arg0: tensor<4xf32>) -> tensor<7xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<7xf32> + return %0 : tensor<7xf32> +} +func.func @next(%arg0: tensor<7xf32>, %arg1: tensor<3xf32>) -> tensor<6xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<6xf32> + return %0 : tensor<6xf32> +} +func.func @finalize(%arg0: tensor<6xf32>, %arg1: tensor<2xf32>) -> tensor<5xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<5xf32> + return %0 : tensor<5xf32> +} + +// CHECK-LABEL: testGeneratorDatasetRegion +func.func @testGeneratorDatasetRegion(%arg0: tensor<4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor<2xf32>) { + // CHECK: "tf.GeneratorDataset" + // CHECK-DAG: @init + // CHECK-DAG: @next + // CHECK-DAG: @finalize + // CHECK: return + %0 = "tf.GeneratorDatasetRegion"(%arg0, %arg1, %arg2, %arg3) ({ + ^bb0(%arg4: tensor<4xf32>): + %1 = func.call @init(%arg4) : (tensor<4xf32>) -> tensor<7xf32> + "tf.Yield"(%1) : (tensor<7xf32>) -> () + }, { + ^bb0(%arg4: tensor<7xf32>, %arg5: tensor<3xf32>): + %1 = func.call @next(%arg4, %arg5) : (tensor<7xf32>, tensor<3xf32>) -> tensor<6xf32> + "tf.Yield"(%1) : (tensor<6xf32>) -> () + }, { + ^bb0(%arg4: tensor<6xf32>, %arg5: tensor<2xf32>): + %1 = func.call @finalize(%arg4, %arg5) : (tensor<6xf32>, tensor<2xf32>) -> tensor<5xf32> + "tf.Yield"(%1) : (tensor<5xf32>) -> () + }) {device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", metadata = "", operandSegmentSizes = array, output_shapes = [#tf_type.shape<>], output_types = [!tf_type.string]} : (tensor<4xf32>, tensor<3xf32>, tensor, tensor<2xf32>) -> tensor + return +} + +// ----- + +func.func @init(%arg0: tensor<4xf32>) -> tensor<7xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<7xf32> + return %0 : tensor<7xf32> +} +func.func @next(%arg0: tensor<3xf32>, %arg1: tensor<7xf32>) -> tensor<6xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<6xf32> + return %0 : tensor<6xf32> +} +func.func @finalize(%arg0: tensor<6xf32>, %arg1: tensor<2xf32>) -> tensor<5xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<5xf32> + return %0 : tensor<5xf32> +} + +// CHECK-LABEL: testGeneratorDatasetRegionWithComplexBlocks +func.func @testGeneratorDatasetRegionWithComplexBlocks(%arg0: tensor<4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor<2xf32>) { + // CHECK: "tf.GeneratorDataset" + // CHECK-NOT: @init + // CHECK-NOT: @next + // CHECK-NOT: @finalize + // CHECK: -> tensor + // CHECK: return + %0 = "tf.GeneratorDatasetRegion"(%arg0, %arg1, %arg2, %arg3) ({ + ^bb0(%arg4: tensor<4xf32>): + %sum = "tf.Add"(%arg4, %arg4) : 
(tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %1 = func.call @init(%sum) : (tensor<4xf32>) -> tensor<7xf32> + "tf.Yield"(%1) : (tensor<7xf32>) -> () + }, { + ^bb0(%arg4: tensor<7xf32>, %arg5: tensor<3xf32>): + %1 = func.call @next(%arg5, %arg4) : (tensor<3xf32>, tensor<7xf32>) -> tensor<6xf32> + "tf.Yield"(%1) : (tensor<6xf32>) -> () + }, { + ^bb0(%arg4: tensor<6xf32>, %arg5: tensor<2xf32>): + %1 = func.call @finalize(%arg4, %arg5) : (tensor<6xf32>, tensor<2xf32>) -> tensor<5xf32> + %sum = "tf.Add"(%1, %1) : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32> + "tf.Yield"(%sum) : (tensor<5xf32>) -> () + }) {device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", metadata = "", operandSegmentSizes = array, output_shapes = [#tf_type.shape<>], output_types = [!tf_type.string]} : (tensor<4xf32>, tensor<3xf32>, tensor, tensor<2xf32>) -> tensor + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir index 8e0e4558b851f0..8fec2a5bb55223 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir @@ -268,10 +268,10 @@ func.func @device_ordinals() { func.return } -// CHECK: tf_executor.island wraps "tf.Const"() {_parallel_execution_ids = "r0:0", value = dense<1> : tensor} -// CHECK: tf_executor.island wraps "tf.Const"() {_parallel_execution_ids = "r0:0", value = dense<3> : tensor} -// CHECK: tf_executor.island wraps "tf.Const"() {_parallel_execution_ids = "r0:1", value = dense<2> : tensor} -// CHECK: tf_executor.island wraps "tf.Const"() {_parallel_execution_ids = "r0:1", value = dense<4> : tensor} +// CHECK: tf_executor.island wraps "tf.Const"() <{value = dense<1> : tensor}> {_parallel_execution_ids = "r0:0"} +// CHECK: tf_executor.island wraps "tf.Const"() <{value = dense<3> : tensor}> {_parallel_execution_ids = "r0:0"} +// CHECK: tf_executor.island wraps "tf.Const"() <{value = dense<2> : tensor}> {_parallel_execution_ids = "r0:1"} +// CHECK: tf_executor.island wraps "tf.Const"() <{value = dense<4> : tensor}> {_parallel_execution_ids = "r0:1"} // ----- // Tests parallel_execute nested inside replicate diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index 02c144467d92c5..9c00c8ee5f849c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -221,7 +221,7 @@ func.func @named_internal_resource() -> tensor<*xi32> { // CHECK-LABEL: func @cluster_with_loop func.func @cluster_with_loop() -> () { - // CHECK: %[[COUNT:.*]] = "tf.Const"() {value = dense<10> : tensor} + // CHECK: %[[COUNT:.*]] = "tf.Const"() <{value = dense<10> : tensor}> %0 = "tf.Const"() {value = dense<10> : tensor} : () -> tensor // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> @@ -253,7 +253,7 @@ func.func @while_body(%arg0: tensor, %arg1: tensor<*x!tf_type.resource, tensor) -> tensor "tf.AssignVariableOp"(%arg1, %add1) : (tensor<*x!tf_type.resource>>, tensor) -> () - // CHECK-NEXT: %[[DELTA:.*]] = "tf.Const"() {value = dense<-1> : tensor} + // CHECK-NEXT: %[[DELTA:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> %constant = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor // CHECK-NEXT: %[[ADD2:.*]] = "tf.AddV2"(%[[BARG0]], %[[DELTA]]) 
%add2 = "tf.AddV2"(%arg0, %constant) : (tensor, tensor) -> tensor @@ -299,7 +299,7 @@ func.func @while_body(%arg0: tensor<*x!tf_type.resource>>) -> (tenso // CHECK-NEXT: return %[[CONST]] func.return %arg0 : tensor<*x!tf_type.resource>> } -// CHECK: func @while_cond(%arg0: tensor) +// CHECK: func @while_cond(%[[CARG0:.*]]: tensor) func.func @while_cond(%arg0: tensor<*x!tf_type.resource>>) -> tensor { %id = "tf.Identity"(%arg0) : (tensor<*x!tf_type.resource>>) -> tensor<*x!tf_type.resource>> %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf_type.resource>>) -> tensor @@ -935,7 +935,7 @@ func.func @cluster_with_caseregion(%arg0: tensor) -> tensor<4xf32> { // CHECK-LABEL: func @cluster_with_whileregion func.func @cluster_with_whileregion() -> () { - // CHECK: %[[COUNT:.*]] = "tf.Const"() {value = dense<10> : tensor} + // CHECK: %[[COUNT:.*]] = "tf.Const"() <{value = dense<10> : tensor}> // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() @@ -959,7 +959,7 @@ func.func @cluster_with_whileregion() -> () { // CHECK: (%[[BARG0:.+]]: tensor, %[[BARG1:.+]]: tensor): // CHECK: %[[ADD0:.*]] = "tf.AddV2"(%[[BARG1]], %[[BARG1]]) // CHECK-NEXT: %[[ADD1:.*]] = "tf.AddV2"(%[[ADD0]], %[[ADD0]]) - // CHECK-NEXT: %[[DELTA:.*]] = "tf.Const"() {value = dense<-1> : tensor} + // CHECK-NEXT: %[[DELTA:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> // CHECK-NEXT: %[[ADD2:.*]] = "tf.AddV2"(%[[BARG0]], %[[DELTA]]) // CHECK-NEXT: "tf.Yield"(%[[ADD2]], %[[ADD1]]) ^bb1(%barg0: tensor, %barg1: !tf_ref, %barg2: !tf_ref, %barg3: !tf_ref): @@ -1046,7 +1046,7 @@ func.func @if_region_with_store_in_then(%arg0: tensor) { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> "tf_device.cluster"() ({ "tf.IfRegion"(%arg0) ({ - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<0.000000e+00> + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<0.000000e+00> // CHECK: "tf.Yield"(%[[CONST]]) %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf_type.resource>>, tensor<4xf32>) -> () @@ -1074,13 +1074,13 @@ func.func @if_region_with_store_in_both(%arg0: tensor) { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> "tf_device.cluster"() ({ "tf.IfRegion"(%arg0) ({ - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<0.000000e+00> + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<0.000000e+00> // CHECK: "tf.Yield"(%[[CONST]]) %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf_type.resource>>, tensor<4xf32>) -> () "tf.Yield"() : () -> () }, { - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> + // CHECK: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> // CHECK: "tf.Yield"(%[[CONST]]) %constant = "tf.Const"() {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf_type.resource>>, tensor<4xf32>) -> () @@ -1123,8 +1123,8 @@ func.func @test_unsupported_resource_op() -> tensor<*xi32> { // to not be lifted and arg1 to be lifted. 
// CHECK-LABEL: func @test_unsupported_resource_op_in_if func.func @test_unsupported_resource_op_in_if(%arg0: tensor) -> tensor<*xi32> { - // CHECK: [[VH0:%.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} - // CHECK: [[VH1:%.*]] = "tf.VarHandleOp"() {container = "d", shared_name = "w"} + // CHECK: [[VH0:%.*]] = "tf.VarHandleOp"() <{container = "c", shared_name = "v"}> + // CHECK: [[VH1:%.*]] = "tf.VarHandleOp"() <{container = "d", shared_name = "w"}> // CHECK-NOT: "tf.ReadVariableOp"([[VH0]]) // CHECK: [[READ1:%.*]] = "tf.ReadVariableOp"([[VH1]]) // CHECK-NOT: "tf.ReadVariableOp"([[VH0]]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/rewrite_tpu_embedding_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/rewrite_tpu_embedding_ops.mlir index 0f1344ce73a8d0..0099129136f79a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/rewrite_tpu_embedding_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/rewrite_tpu_embedding_ops.mlir @@ -2,8 +2,8 @@ // CHECK-LABEL: func @recv_tpu_embedding_activations func.func @recv_tpu_embedding_activations() -> (tensor<512x256xf32>) { - // CHECK: %[[DATA:.*]] = "tf.XlaRecvTPUEmbeddingDeduplicationData"() {config = {{.*}}} : () -> tensor - // CHECK: %[[RESULT:.*]] = "tf.XlaRecvTPUEmbeddingActivations"(%[[DATA]]) {config = {{.*}}} : (tensor) -> tensor<512x256xf32> + // CHECK: %[[DATA:.*]] = "tf.XlaRecvTPUEmbeddingDeduplicationData"() <{config = {{.*}}}> : () -> tensor + // CHECK: %[[RESULT:.*]] = "tf.XlaRecvTPUEmbeddingActivations"(%[[DATA]]) <{config = {{.*}}}> : (tensor) -> tensor<512x256xf32> // CHECK: return %[[RESULT]] // CHECK-NOT: tf.RecvTPUEmbeddingActivations // CHECK-NOT: tf.SendTPUEmbeddingGradients @@ -14,8 +14,8 @@ func.func @recv_tpu_embedding_activations() -> (tensor<512x256xf32>) { // CHECK-LABEL: func @send_tpu_embedding_gradients func.func @send_tpu_embedding_gradients(%arg0: tensor<512x256xf32>) -> () { - // CHECK: %[[DATA:.*]] = "tf.XlaRecvTPUEmbeddingDeduplicationData"() {config = {{.*}}} : () -> tensor - // CHECK: "tf.XlaSendTPUEmbeddingGradients"(%arg0, %[[DATA]]) {config = {{.*}}, operandSegmentSizes = array} : (tensor<512x256xf32>, tensor) -> () + // CHECK: %[[DATA:.*]] = "tf.XlaRecvTPUEmbeddingDeduplicationData"() <{config = {{.*}}}> : () -> tensor + // CHECK: "tf.XlaSendTPUEmbeddingGradients"(%arg0, %[[DATA]]) <{config = {{.*}}, operandSegmentSizes = array}> : (tensor<512x256xf32>, tensor) -> () // CHECK-NOT: tf.SendTPUEmbeddingGradients // CHECK-NOT: tf.RecvTPUEmbeddingActivations diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 45d5ba99fc5dea..8e9218354bad4d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -131,7 +131,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-SAME: -> tensor<1x2x3xf32> func.func @shape_from_if_to_region_bodies_to_output(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<*xf32> { %unshaped = "tf.Cast"(%arg1) : (tensor<1x2x3xf32>) -> tensor<*xf32> - %0 = "tf.IfRegion"(%arg0) ({ + // CHECK: <{is_stateless = true}> + %0 = "tf.IfRegion"(%arg0) <{is_stateless = true}> ({ // CHECK: "tf.Add"{{.+}}(tensor<1x2x3xf32>, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> // CHECK: "tf.Yield"{{.+}}(tensor<1x2x3xf32>) -> () %1 = "tf.Add"(%unshaped, %unshaped) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> @@ -141,8 +142,8 @@ module attributes {tf.versions = 
{bad_consumers = [], min_consumer = 0 : i32, pr // CHECK: "tf.Yield"{{.+}}(tensor<1x2x3xf32>) -> () %2 = "tf.Sub"(%unshaped, %unshaped) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> "tf.Yield"(%2) : (tensor<*xf32>) -> () - // CHECK: {is_stateless = true} : (tensor) -> tensor<1x2x3xf32> - }) {is_stateless = true} : (tensor) -> tensor<*xf32> + // CHECK: (tensor) -> tensor<1x2x3xf32> + }) : (tensor) -> tensor<*xf32> // CHECK: return {{.*}} : tensor<1x2x3xf32> func.return %0 : tensor<*xf32> } @@ -176,7 +177,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-SAME: -> tensor<1x2x3xf32> func.func @shape_from_case_to_region_bodies_to_output(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<*xf32> { %unshaped = "tf.Cast"(%arg1) : (tensor<1x2x3xf32>) -> tensor<*xf32> - %0 = "tf.CaseRegion"(%arg0) ({ + // CHECK: <{is_stateless = true}> + %0 = "tf.CaseRegion"(%arg0) <{is_stateless = true}> ({ // CHECK: "tf.Add"{{.+}}(tensor<1x2x3xf32>, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> // CHECK: "tf.Yield"{{.+}}(tensor<1x2x3xf32>) -> () %1 = "tf.Add"(%unshaped, %unshaped) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> @@ -186,8 +188,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK: "tf.Yield"{{.+}}(tensor<1x2x3xf32>) -> () %2 = "tf.Sub"(%unshaped, %unshaped) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> "tf.Yield"(%2) : (tensor<*xf32>) -> () - // CHECK: {is_stateless = true} : (tensor) -> tensor<1x2x3xf32> - }) {is_stateless = true} : (tensor) -> tensor<*xf32> + // CHECK: (tensor) -> tensor<1x2x3xf32> + }) : (tensor) -> tensor<*xf32> // CHECK: return {{.*}} : tensor<1x2x3xf32> func.return %0 : tensor<*xf32> } @@ -243,7 +245,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.func @shape_from_while_operands_to_cond_body_to_while_results(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<*xf32> { %unshaped = "tf.Cast"(%arg1) : (tensor<1x2x3xf32>) -> tensor<*xf32> // CHECK: "tf.WhileRegion" - %0:2 = "tf.WhileRegion"(%arg0, %unshaped) ({ + // CHECK: <{is_stateless = true}> + %0:2 = "tf.WhileRegion"(%arg0, %unshaped) <{is_stateless = true}> ({ // CHECK: {{.*}}({{.+}}: tensor, {{.+}}: tensor<1x2x3xf32>): ^bb0(%carg0: tensor, %carg1: tensor<*xf32>): %limit = arith.constant dense<5> : tensor @@ -258,8 +261,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %neg = "tf.Neg"(%barg1) : (tensor<*xf32>) -> tensor<*xf32> // CHECK: "tf.Yield"{{.+}}, {{.+}}) : (tensor, tensor<1x2x3xf32>) -> () "tf.Yield"(%sub, %neg) : (tensor, tensor<*xf32>) -> () - // CHECK: {is_stateless = true} : (tensor, tensor<1x2x3xf32>) -> (tensor, tensor<1x2x3xf32>) - }) {is_stateless = true} : (tensor, tensor<*xf32>) -> (tensor, tensor<*xf32>) + // CHECK: (tensor, tensor<1x2x3xf32>) -> (tensor, tensor<1x2x3xf32>) + }) : (tensor, tensor<*xf32>) -> (tensor, tensor<*xf32>) // CHECK: return {{.+}}#1 : tensor<1x2x3xf32> func.return %0#1 : tensor<*xf32> } @@ -752,7 +755,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-LABEL: replace_tensor_list_element_shape func.func @replace_tensor_list_element_shape() { - // CHECK: %[[ELEMENT_SHAPE:.*]] = "tf.Const"() {value = dense<[-1, 1]> : tensor<2xi32>} + // CHECK: %[[ELEMENT_SHAPE:.*]] = "tf.Const"() <{value = dense<[-1, 1]> : tensor<2xi32>}> %elem_shape = "tf.Const"() {value = dense<[-1, 1]> : tensor<2xi32>} : () -> tensor<2xi32> %size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor 
%elem = "tf._SomeOp"() : () -> tensor<16x1xf32> @@ -767,7 +770,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tf._SomeOtherOp"(%shape_32, %shape_64) : (tensor, tensor) -> () func.return } - + // CHECK-LABEL: refine_pop_back_results_from_operands func.func @refine_pop_back_results_from_operands(%arg0: tensor>>, %arg1: tensor<1xi32>) -> (tensor, tensor<*xi32>) { %0, %1 = "tf.TensorListPopBack"(%arg0, %arg1) : (tensor>>, tensor<1xi32>) -> (tensor, tensor<*xi32>) @@ -876,7 +879,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr tf_device.return %2 : tensor<1x8x2xf32> // CHECK: () -> tensor<1x8x2xf32> }) {device = "/device:CPU:0"} : () -> tensor<*xf32> - // CHECK: "tf.Cast"(%{{.*}}) {Truncate = false} : (tensor<1x8x2xf32>) -> tensor<*xf32> + // CHECK: "tf.Cast"(%{{.*}}) <{Truncate = false}> : (tensor<1x8x2xf32>) -> tensor<*xf32> // CHECK: (tensor, tensor<1x8x2xf32>) -> (tensor<1x8x1xf32>, tensor<1x8x1xf32>) %3:2 = "tf.Split"(%0, %1) {device = ""} : (tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) %4 = tensor.cast %1 : tensor<*xf32> to tensor @@ -891,7 +894,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr tf_device.return %2 : tensor<1x8x2xf32> // CHECK: () -> tensor<1x8x2xf32> }) : () -> tensor<*xf32> - // CHECK: "tf.Cast"(%{{.*}}) {Truncate = false} : (tensor<1x8x2xf32>) -> tensor<*xf32> + // CHECK: "tf.Cast"(%{{.*}}) <{Truncate = false}> : (tensor<1x8x2xf32>) -> tensor<*xf32> // CHECK: (tensor, tensor<1x8x2xf32>) -> (tensor<1x8x1xf32>, tensor<1x8x1xf32>) %3:2 = "tf.Split"(%0, %1) {device = ""} : (tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) %4 = tensor.cast %1 : tensor<*xf32> to tensor @@ -980,7 +983,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %0:4 = "tf.While"(%arg0, %arg1, %arg2, %arg3) {cond = @while_shape_invariant_cond_func_propagate, body = @while_shape_invariant_body_func_propagate, is_stateless = false, shape_invariant} : (tensor<4xf32>, tensor>>, tensor>>, tensor<1xi32>) -> (tensor<*xf32>, tensor<*x!tf_type.resource>, tensor, tensor) // CHECK: "tf.WhileRegion" - %1:4 = "tf.WhileRegion"(%arg0, %arg1, %arg2, %arg3) ({ + // CHECK-SAME: shape_invariant + %1:4 = "tf.WhileRegion"(%arg0, %arg1, %arg2, %arg3) <{is_stateless = false, shape_invariant}> ({ // CHECK-NEXT: ^{{.+}}({{%.+}}: tensor<*xf32>, {{%.+}}: tensor<*x!tf_type.resource>>, {{%.+}}: tensor>>, {{%.+}}: tensor): ^cond(%carg0: tensor<*xf32>, %carg1: tensor<*x!tf_type.resource>, %carg2: tensor, %carg3: tensor): %2 = "tf.Const"() {value = dense : tensor} : () -> tensor @@ -992,10 +996,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK: "tf.Yield" // CHECK-SAME: (tensor<*xf32>, tensor<*x!tf_type.resource>>, tensor>>, tensor) -> () "tf.Yield"(%barg0, %barg1, %barg2, %2) : (tensor<*xf32>, tensor<*x!tf_type.resource>, tensor, tensor) -> () - // CHECK-NEXT: shape_invariant - // CHECK-SAME: (tensor<4xf32>, tensor>>, tensor>>, tensor<1xi32>) + // CHECK-NEXT: (tensor<4xf32>, tensor>>, tensor>>, tensor<1xi32>) // CHECK-SAME: -> (tensor<*xf32>, tensor<*x!tf_type.resource>>, tensor>>, tensor) - }) {is_stateless = false, shape_invariant} : (tensor<4xf32>, tensor>>, tensor>>, tensor<1xi32>) -> (tensor<*xf32>, tensor<*x!tf_type.resource>, tensor, tensor) + }) : (tensor<4xf32>, tensor>>, tensor>>, tensor<1xi32>) -> (tensor<*xf32>, tensor<*x!tf_type.resource>, tensor, tensor) func.return %0#0, %0#1, %0#2, %0#3, %1#0, %1#1, %1#2, %1#3 
: tensor<*xf32>, tensor<*x!tf_type.resource>, tensor, tensor, tensor<*xf32>, tensor<*x!tf_type.resource>, tensor, tensor } @@ -1028,7 +1031,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %0 = "tf.While"(%arg0) {cond = @while_shape_invariant_cond_func_different_dims, body = @while_shape_invariant_body_func_different_dims, is_stateless = false, shape_invariant} : (tensor<1x2x3xf32>) -> tensor<1x8x3xf32> // CHECK: "tf.WhileRegion" - %1 = "tf.WhileRegion"(%arg0) ({ + // CHECK-SAME: shape_invariant + %1 = "tf.WhileRegion"(%arg0) <{is_stateless = false, shape_invariant}> ({ // CHECK-NEXT: ^{{.+}}({{%.+}}: tensor<1x?x3xf32>): ^cond(%carg0: tensor<*xf32>): %2 = "tf.Const"() {value = dense : tensor} : () -> tensor @@ -1040,10 +1044,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK: "tf.Yield" // CHECK-SAME: (tensor<1x?x3xf32>) -> () "tf.Yield"(%2) : (tensor<*xf32>) -> () - // CHECK-NEXT: shape_invariant - // CHECK-SAME: (tensor<1x2x3xf32>) + // CHECK-NEXT: (tensor<1x2x3xf32>) // CHECK-SAME: -> tensor<1x8x3xf32> - }) {is_stateless = false, shape_invariant} : (tensor<1x2x3xf32>) -> tensor<1x8x3xf32> + }) : (tensor<1x2x3xf32>) -> tensor<1x8x3xf32> func.return %0, %1 : tensor<1x8x3xf32>, tensor<1x8x3xf32> } @@ -1076,7 +1079,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %0 = "tf.While"(%arg0) {cond = @while_shape_invariant_cond_func_body_result_propagate, body = @while_shape_invariant_body_func_body_result_propagate, is_stateless = false, shape_invariant} : (tensor<*x!tf_type.resource>>) -> tensor<*x!tf_type.resource> // CHECK: "tf.WhileRegion" - %1 = "tf.WhileRegion"(%arg0) ({ + // CHECK-SAME: shape_invariant + %1 = "tf.WhileRegion"(%arg0) <{is_stateless = false, shape_invariant}> ({ // CHECK-NEXT: ^{{.+}}({{%.+}}: tensor<*x!tf_type.resource>>): ^cond(%carg0: tensor<*x!tf_type.resource>>): %2 = "tf.Const"() {value = dense : tensor} : () -> tensor @@ -1088,10 +1092,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK: "tf.Yield" // CHECK-SAME: (tensor<*x!tf_type.resource>>) -> () "tf.Yield"(%2) : (tensor<*x!tf_type.resource>>) -> () - // CHECK-NEXT: shape_invariant - // CHECK-SAME: (tensor<*x!tf_type.resource>>) + // CHECK-NEXT: (tensor<*x!tf_type.resource>>) // CHECK-SAME: -> tensor<*x!tf_type.resource>> - }) {is_stateless = false, shape_invariant} : (tensor<*x!tf_type.resource>>) -> tensor<*x!tf_type.resource> + }) : (tensor<*x!tf_type.resource>>) -> tensor<*x!tf_type.resource> func.return %0, %1 : tensor<*x!tf_type.resource>, tensor<*x!tf_type.resource> } @@ -1333,7 +1336,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %cst = "tf.Const"() {value = dense<0> : tensor<4x2xi32>} : () -> tensor<4x2xi32> %cst_0 = "tf.Const"() {value = dense<[2, 2, 1, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %cst_1 = "tf.Const"() {value = dense<[2, 3, 1, 1]> : tensor<4xi32>} : () -> tensor<4xi32> - // CHECK: %0 = "tf.XlaSelectAndScatter"(%arg0, %cst_1, %cst_0, %cst, %arg1, %arg2) {scatter = @add_scatter, select = @ge_select} : (tensor<4x5x1x1xbf16>, tensor<4xi32>, tensor<4xi32>, tensor<4x2xi32>, tensor<2x2x1x1xbf16>, tensor) -> tensor<4x5x1x1xbf16> + // CHECK: %0 = "tf.XlaSelectAndScatter"(%arg0, %cst_1, %cst_0, %cst, %arg1, %arg2) <{scatter = @add_scatter, select = @ge_select}> : (tensor<4x5x1x1xbf16>, tensor<4xi32>, tensor<4xi32>, tensor<4x2xi32>, tensor<2x2x1x1xbf16>, tensor) -> tensor<4x5x1x1xbf16> %0 = 
"tf.XlaSelectAndScatter"(%arg0, %cst_1, %cst_0, %cst, %arg1, %arg2) {scatter = @add_scatter, select = @ge_select} : (tensor<4x5x1x1xbf16>, tensor<4xi32>, tensor<4xi32>, tensor<4x2xi32>, tensor<2x2x1x1xbf16>, tensor) -> tensor func.return %0 : tensor } @@ -1373,7 +1376,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %cst_1 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> %cst_2 = "tf.Const"() {value = dense<3> : tensor<1xi32>} : () -> tensor<1xi32> %cst_3 = "tf.Const"() {value = dense<4> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK: 0 = "tf.XlaReduceWindow"(%arg0, %arg1, %cst_0, %cst_1, %cst_2, %cst_3, %cst) {computation = @sum_reducer3} : (tensor<7xf32>, tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1x2xi32>) -> tensor<10xf32> + // CHECK: 0 = "tf.XlaReduceWindow"(%arg0, %arg1, %cst_0, %cst_1, %cst_2, %cst_3, %cst) <{computation = @sum_reducer3}> : (tensor<7xf32>, tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1x2xi32>) -> tensor<10xf32> %0 = "tf.XlaReduceWindow"(%arg0, %arg1, %cst_0, %cst_1, %cst_2, %cst_3, %cst) {computation = @sum_reducer3} : (tensor<7xf32>, tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1x2xi32>) -> tensor func.return %0 : tensor } @@ -1813,7 +1816,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.func @infer_var_handle_op_from_assigns() -> tensor<1xi8> { %cst = arith.constant dense<1> : tensor<1xi8> %0 = "tf.VarHandleOp"() {container = "", shared_name = "bar"} : () -> tensor>> - // CHECK: "tf.VarHandleOp"() {container = "", shared_name = "bar"} : () -> tensor>> + // CHECK: "tf.VarHandleOp"() <{container = "", shared_name = "bar"}> : () -> tensor>> "tf.AssignVariableOp"(%0, %cst) : (tensor>>, tensor<1xi8>) -> () func.return %cst : tensor<1xi8> } @@ -1822,7 +1825,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.func @infer_var_handle_op_from_read() -> tensor<1xi8> { %cst = arith.constant dense<1> : tensor<1xi8> %0 = "tf.VarHandleOp"() {container = "", shared_name = "bar"} : () -> tensor>> - // CHECK: "tf.VarHandleOp"() {container = "", shared_name = "bar"} : () -> tensor>> + // CHECK: "tf.VarHandleOp"() <{container = "", shared_name = "bar"}> : () -> tensor>> %read = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<1xi8> func.return %read : tensor<1xi8> } @@ -1831,7 +1834,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.func @do_not_infer_var_handle_op_when_custom_op_uses_it() -> tensor<1xi8> { %cst = arith.constant dense<1> : tensor<1xi8> %0 = "tf.VarHandleOp"() {container = "", shared_name = "bar"} : () -> tensor>> - // CHECK: "tf.VarHandleOp"() {container = "", shared_name = "bar"} : () -> tensor>> + // CHECK: "tf.VarHandleOp"() <{container = "", shared_name = "bar"}> : () -> tensor>> %read = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<1xi8> %1 = "tf.MyCustomOp"(%0) : (tensor>>) -> tensor<4xi8> func.return %read : tensor<1xi8> @@ -1916,7 +1919,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %lhs_dilation = "tf.Const"() {value = dense<[4, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %padding = "tf.Const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> %strides = "tf.Const"() {value = dense<[3, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) {dimension_numbers = 
"\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x?x?x?x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> + // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) <{dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""}> : (tensor<8x?x?x?x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> %0 = "tf.XlaConvV2"(%lhs, %rhs, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x?x?x?x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> func.return %0 : tensor<8x4x14x14x16xf32> } @@ -1928,7 +1931,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %lhs_dilation = "tf.Const"() {value = dense<[4, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %padding = "tf.Const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> %strides = "tf.Const"() {value = dense<[3, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> + // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) <{dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""}> : (tensor<8x4x16x16x16xf32>, tensor, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> %0 = "tf.XlaConvV2"(%lhs, %rhs, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> func.return %0 : tensor<8x4x14x14x16xf32> } @@ -1941,7 +1944,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %cst_2 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> %cst_3 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %0 = tf_executor.graph { - // CHECK: "tf.XlaConvV2"(%arg0, %arg1, %cst, %cst_0, %cst_1, %cst_2, %cst_3) {_XlaHasReferenceVars = false, device = "/job:localhost/replica:0/task:0/device:XLA_CPU:0", dimension_numbers = "\18\012\01\02@\01P\01Z\01\02b\01\02", precision_config = "\0A\02\01\01"} : (tensor<*xf32>, tensor<*xf32>, tensor<1xi32>, tensor<1x2xi32>, tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<*xf32> + // CHECK: "tf.XlaConvV2"(%arg0, %arg1, %cst, %cst_0, %cst_1, %cst_2, %cst_3) <{dimension_numbers = "\18\012\01\02@\01P\01Z\01\02b\01\02", precision_config = "\0A\02\01\01"}> {_XlaHasReferenceVars = false, device = "/job:localhost/replica:0/task:0/device:XLA_CPU:0"} : (tensor<*xf32>, tensor<*xf32>, tensor<1xi32>, tensor<1x2xi32>, tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<*xf32> %outputs, %control = tf_executor.island wraps "tf.XlaConvV2"(%arg0, %arg1, %cst, %cst_0, %cst_1, %cst_2, %cst_3) 
{_XlaHasReferenceVars = false, device = "/job:localhost/replica:0/task:0/device:XLA_CPU:0", dimension_numbers = "\18\012\01\02@\01P\01Z\01\02b\01\02", precision_config = "\0A\02\01\01"} : (tensor<*xf32>, tensor<*xf32>, tensor<1xi32>, tensor<1x2xi32>, tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<*xf32> tf_executor.fetch %outputs : tensor<*xf32> } @@ -1955,7 +1958,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %lhs_dilation = "tf.Const"() {value = dense<[4, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %padding = "tf.Const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> %strides = "tf.Const"() {value = dense<[3, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> + // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) <{dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""}> : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> %0 = "tf.XlaConvV2"(%lhs, %rhs, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> func.return %0 : tensor<8x4x14x14x16xf32> } @@ -1967,7 +1970,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %lhs_dilation = "tf.Const"() {value = dense<[4, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %padding = "tf.Const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> %strides = "tf.Const"() {value = dense<[3, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> + // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) <{dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""}> : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> %0 = "tf.XlaConvV2"(%lhs, %rhs, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor func.return %0 : tensor } @@ -1979,7 +1982,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %lhs_dilation = "tf.Const"() {value = dense<[4, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %padding = "tf.Const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> %strides = "tf.Const"() {value = 
dense<[3, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf16>, tensor<4x3x3x16x16xf16>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> + // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) <{dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""}> : (tensor<8x4x16x16x16xf16>, tensor<4x3x3x16x16xf16>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> %0 = "tf.XlaConvV2"(%lhs, %rhs, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf16>, tensor<4x3x3x16x16xf16>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor func.return %0 : tensor } @@ -1991,7 +1994,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %lhs_dilation = "tf.Const"() {value = dense<[4, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %padding = "tf.Const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> %strides = "tf.Const"() {value = dense<[3, 1, 1]> : tensor<3xi64>} : () -> tensor<3xi64> - // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi64>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> + // CHECK: %0 = "tf.XlaConvV2"(%arg0, %arg1, %cst_3, %cst_2, %cst_1, %cst_0, %cst) <{dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""}> : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi64>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> %0 = "tf.XlaConvV2"(%lhs, %rhs, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi64>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor func.return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/split_into_island_per_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/split_into_island_per_op.mlir index 7307bff6e69914..4428811b7df371 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/split_into_island_per_op.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/split_into_island_per_op.mlir @@ -116,7 +116,7 @@ func.func @dangling_print(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<* // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg1) // CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1]], %arg1) -// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2]]) {message = "add result"} +// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2]]) <{message = "add result"}> // CHECK: tf_executor.fetch %[[ADD1]], %[[ADD2]] : // CHECK: } // CHECK: 
return %[[GRAPH]]#0, %[[GRAPH]]#1 @@ -186,7 +186,7 @@ func.func @non_aliasing_reads_writes( // CHECK: %[[READ0:.*]], %[[READ0_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg0) // CHECK: %[[ASSIGN0_CONTROL:.*]] = tf_executor.island wraps "tf.AssignVariableOp"(%arg0, %arg2) // CHECK: %[[READ1:.*]], %[[READ1_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg1) -// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v0"} +// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v0"}> // CHECK: %[[READ2:.*]], %[[READ2_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[VH0]]) // CHECK: %[[ASSIGN1_CONTROL:.*]] = tf_executor.island wraps "tf.AssignVariableOp"(%arg1, %[[READ0]]) // CHECK: %[[ASSIGN2_CONTROL:.*]] = tf_executor.island wraps "tf.AssignVariableOp"(%arg0, %[[READ2]]) @@ -214,8 +214,8 @@ func.func @unknown_side_effecting_op(%arg0: tensor<32xf32>) -> () { // CHECK-LABEL: func @unknown_side_effecting_op // CHECK: tf_executor.graph { -// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v0"} -// CHECK: %[[VH1:.*]], %[[VH1_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v1"} +// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v0"}> +// CHECK: %[[VH1:.*]], %[[VH1_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v1"}> // CHECK: %[[READ0:.*]], %[[READ0_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[VH0]]) // CHECK: %[[ASSIGN0_CONTROL:.*]] = tf_executor.island wraps "tf.AssignVariableOp"(%[[VH1]], %arg0) // CHECK: %[[UNKNOWN_CONTROL:.*]] = tf_executor.island wraps "tf._UnknownSideEffectingOp_"() @@ -432,4 +432,4 @@ func.func @else_function() { tf_executor.fetch } func.return -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir index ee663e98ae9674..907d512b2571b6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir @@ -4,15 +4,15 @@ // CHECK-LABEL: func @main func.func @main() -> tensor { - // CHECK-NEXT: "tf.Const"() {value = dense<10> : tensor} + // CHECK-NEXT: "tf.Const"() <{value = dense<10> : tensor}> %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor - // CHECK-NEXT: %[[ZERO_SCALAR:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-NEXT: %[[ZERO_SCALAR:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-NEXT: %[[CAST_ZERO:.*]] = "tf.Cast"(%[[ZERO_SCALAR]]) : (tensor) -> tensor - // CHECK-NEXT: %[[CONST10:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[CONST10:.*]] = "tf.Const"() <{value = dense<10> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[CAST_ZERO]], %[[CONST10]]) : (tensor, tensor<1xi32>) -> tensor<10xf32> // CHECK-NEXT: %[[BUFFER:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK-NEXT: %[[SIZE:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> - // CHECK-NEXT: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: 
%[[ZERO:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: "tf.AssignVariableOp"(%[[SIZE]], %[[ZERO]]) // CHECK-NEXT: "tf.AssignVariableOp"(%[[BUFFER]], %[[BROADCAST]]) %stack = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor @@ -21,22 +21,22 @@ func.func @main() -> tensor { %elem = "tf._SomeOp"() : () -> tensor // CHECK-NEXT: %[[READ_VAL:.*]] = "tf.ReadVariableOp"(%[[BUFFER]]) // CHECK-NEXT: %[[READ_SIZE:.*]] = "tf.ReadVariableOp"(%[[SIZE]]) - // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[UPDATE_SLICE:.*]] = "tf.Reshape"(%[[PUSHVAL]], %[[UPDATE_SHAPE]]) : (tensor, tensor<1xi32>) -> tensor<1xf32> // CHECK-NEXT: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[READ_VAL]], %[[UPDATE_SLICE]], %[[READ_SIZE]]) : (tensor<10xf32>, tensor<1xf32>, tensor<1xi32>) -> tensor<10xf32> // CHECK-NEXT: "tf.AssignVariableOp"(%[[BUFFER]], %[[UPDATE]]) : (tensor>>, tensor<10xf32>) -> () - // CHECK-NEXT: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[NEW_SIZE:.*]] = "tf.AddV2"(%[[READ_SIZE]], %[[CONST1]]) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK-NEXT: "tf.AssignVariableOp"(%[[SIZE]], %[[NEW_SIZE]]) : (tensor>>, tensor<1xi32>) -> () %push = "tf.StackPushV2"(%id, %elem) {swap_memory = false} : (tensor, tensor) -> tensor %pop = "tf.StackPopV2"(%stack) : (tensor) -> tensor // CHECK-NEXT: %[[READ_VAL1:.*]] = "tf.ReadVariableOp"(%[[BUFFER]]) // CHECK-NEXT: %[[READ_SIZE1:.*]] = "tf.ReadVariableOp"(%[[SIZE]]) - // CHECK-NEXT: %[[CONST1_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[CONST1_1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[SUB:.*]] = "tf.Sub"(%[[READ_SIZE1]], %[[CONST1_1]]) - // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[SLICE:.*]] = "tf.Slice"(%[[READ_VAL1]], %[[SUB]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> - // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK-NEXT: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor // CHECK-NEXT: "tf.AssignVariableOp"(%[[SIZE]], %[[SUB]]) : (tensor>>, tensor<1xi32>) -> () "tf.StackCloseV2"(%stack) : (tensor) -> () @@ -50,14 +50,14 @@ func.func @main() -> tensor { // CHECK-LABEL: func @main func.func @main() -> tensor<2xi32> { - // CHECK-NEXT: "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NEXT: "tf.Const"() <{value = dense<10> : tensor}> : () -> tensor %size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor - // CHECK-NEXT: %[[ZERO_CONST:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-NEXT: %[[STACK_SHAPE:.*]] = "tf.Const"() {value = dense<[10, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK-NEXT: %[[ZERO_CONST:.*]] = 
"tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK-NEXT: %[[STACK_SHAPE:.*]] = "tf.Const"() <{value = dense<[10, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK-NEXT: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[ZERO_CONST]], %[[STACK_SHAPE]]) : (tensor, tensor<2xi32>) -> tensor<10x2xi32> // CHECK-NEXT: %[[BUFFER:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK-NEXT: %[[SIZE:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> - // CHECK-NEXT: %[[ZERO_SIZE:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[ZERO_SIZE:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: "tf.AssignVariableOp"(%[[SIZE]], %[[ZERO_SIZE]]) : (tensor>>, tensor<1xi32>) -> () // CHECK-NEXT: "tf.AssignVariableOp"(%[[BUFFER]], %[[BROADCAST]]) : (tensor>>, tensor<10x2xi32>) -> () %stack = "tf.StackV2"(%size) {elem_type = i32, stack_name = "s"} : (tensor) -> tensor @@ -65,14 +65,14 @@ func.func @main() -> tensor<2xi32> { %elem = "tf._SomeOp"() : () -> tensor<2xi32> // CHECK-NEXT: %[[STACK_VAL:.*]] = "tf.ReadVariableOp"(%[[BUFFER]]) : (tensor>>) -> tensor<10x2xi32> // CHECK-NEXT: %[[STACK_SIZE:.*]] = "tf.ReadVariableOp"(%[[SIZE]]) : (tensor>>) -> tensor<1xi32> - // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK-NEXT: %[[UPDATE_SLICE:.*]] = "tf.Reshape"(%[[PUSH_VAL]], %[[UPDATE_SHAPE]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<1x2xi32> - // CHECK-NEXT: %[[ZERO_INDS:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-NEXT: %[[CONCAT_DIM:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-NEXT: %[[ZERO_INDS:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK-NEXT: %[[CONCAT_DIM:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-NEXT: %[[CONCAT_OFFETS:.*]] = "tf.ConcatV2"(%[[STACK_SIZE]], %[[ZERO_INDS]], %[[CONCAT_DIM]]) : (tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<2xi32> // CHECK-NEXT: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[STACK_VAL]], %[[UPDATE_SLICE]], %[[CONCAT_OFFETS]]) : (tensor<10x2xi32>, tensor<1x2xi32>, tensor<2xi32>) -> tensor<10x2xi32> // CHECK-NEXT: "tf.AssignVariableOp"(%[[BUFFER]], %[[UPDATE]]) : (tensor>>, tensor<10x2xi32>) -> () - // CHECK-NEXT: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[NEW_SIZE:.*]] = "tf.AddV2"(%[[STACK_SIZE]], %[[CONST1]]) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK-NEXT: "tf.AssignVariableOp"(%[[SIZE]], %[[NEW_SIZE]]) : (tensor>>, tensor<1xi32>) -> () %push = "tf.StackPushV2"(%stack, %elem) {swap_memory = false} : (tensor, tensor<2xi32>) -> tensor<2xi32> @@ -102,7 +102,7 @@ func.func @main() -> () { } // CHECK: func @while_body(%[[BARG0:.*]]: tensor>>, %[[BARG1:.*]]: tensor, %[[BARG2:.*]]: tensor>>) func.func @while_body(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BARG1]], %[[CONST1]]) %sub = "tf.Sub"(%arg1, 
%const1) : (tensor, tensor) -> tensor @@ -143,7 +143,7 @@ func.func @main() -> () { }, { // CHECK: ^bb0(%[[BARG0:.*]]: tensor ^bb0(%barg0: tensor): - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BARG0]], %[[CONST1]]) %sub = "tf.Sub"(%barg0, %const1) : (tensor, tensor) -> tensor @@ -185,7 +185,7 @@ func.func @main(%arg0: tensor) -> () { // CHECK: tf.AssignVariableOp // CHECK: tf.AssignVariableOp %stack = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor - // CHECK: %[[CASE_OUTPUT:.*]] = "tf.CaseRegion"(%[[BRANCH_INDEX]]) ({ + // CHECK: %[[CASE_OUTPUT:.*]] = "tf.CaseRegion"(%[[BRANCH_INDEX]]) {{.*}} ({ %case_op = "tf.CaseRegion"(%arg0) ({ %elem = "tf._SomeOp"() : () -> tensor // CHECK-NOT: tf.StackPushV2 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir index 06ecfd401328ed..6adc432958e06a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir @@ -10,9 +10,9 @@ func.func @main() -> tensor<3xf32> { // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp"(%[[VAR]], %[[BUFFER]]) %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf_type.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) - // CHECK: %[[IND:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[IND:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // CHECK: %[[VAL:.*]] = "tf.Const"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<3xf32>} : () -> tensor<3xf32> + // CHECK: %[[VAL:.*]] = "tf.Const"() <{value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<3xf32>}> : () -> tensor<3xf32> %value = "tf.Const"() {value = dense<[1.0, 2.0, 3.0]> : tensor<3xf32>} : () -> tensor<3xf32> // CHECK: %[[READ_VAR:.*]] = "tf.ReadVariableOp"(%[[VAR]]) // CHECK: %[[UPDATE_SLICE:.*]] = "tf.Reshape"(%[[VAL]] @@ -46,7 +46,7 @@ func.func @main() -> tensor { %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %value = "tf.Const"() {value = dense<[1.0, 2.0, 3.0]> : tensor<3xf32>} : () -> tensor<3xf32> %write = "tf.TensorArrayWriteV3"(%ta#0, %index, %value, %ta#1) : (tensor, tensor, tensor<3xf32>, tensor) -> tensor - // CHECK: %[[SIZE:.*]] = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + // CHECK: %[[SIZE:.*]] = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor %size_out = "tf.TensorArraySizeV3"(%ta#0, %write) : (tensor, tensor) -> tensor // CHECK: return %[[SIZE]] : tensor func.return %size_out : tensor @@ -110,7 +110,7 @@ func.func @main() -> () { // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> // CHECK: %[[CONCAT_RESHAPE:.*]] = "tf.Reshape"(%[[READ]], // CHECK-SAME: -> tensor<15xf32> - // CHECK: %[[LENS:.*]] = "tf.Const"() {value = dense<3> : tensor<5xi64>} : () -> tensor<5xi64> + // CHECK: %[[LENS:.*]] = "tf.Const"() <{value = dense<3> : tensor<5xi64>}> : () -> tensor<5xi64> %concat:2 = 
"tf.TensorArrayConcatV3"(%ta#0, %ta#1) {element_shape_except0 = #tf_type.shape<*>} : (tensor, tensor) -> (tensor<*xf32>, tensor<*xi64>) // CHECK: %[[SPLIT_RESHAPE:.*]] = "tf.Reshape"(%[[CONCAT_RESHAPE]], // CHECK-SAME: -> tensor<5x3xf32> @@ -153,33 +153,33 @@ func.func @main() -> () { // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp" %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf_type.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) - // CHECK: %[[INDS:.*]] = "tf.Const"() {value = dense<[2, 1]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[INDS:.*]] = "tf.Const"() <{value = dense<[2, 1]> : tensor<2xi32>}> : () -> tensor<2xi32> %indices = "tf.Const"() {value = dense<[2, 1]> : tensor<2xi32>} : () -> tensor<2xi32> // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> - // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[GATHER:.*]] = "tf.GatherV2"(%[[READ]], %[[INDS]], %[[AXIS]]) : (tensor<5x3xf32>, tensor<2xi32>, tensor) -> tensor<2x3xf32> %gather = "tf.TensorArrayGatherV3"(%ta#0, %indices, %ta#1) {element_shape = #tf_type.shape<*>} : (tensor, tensor<2xi32>, tensor) -> tensor<*xf32> // CHECK: %[[READ2:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> - // CHECK-DAG: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<[1, 3]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK-DAG: %[[IND_SLICE0_START:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-DAG: %[[IND_SLICE0_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-DAG: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<[1, 3]> : tensor<2xi32>}> : () -> tensor<2xi32> + // CHECK-DAG: %[[IND_SLICE0_START:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK-DAG: %[[IND_SLICE0_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[IND_SLICE0:.*]] = "tf.Slice"(%[[INDS]], %[[IND_SLICE0_START]], %[[IND_SLICE0_SIZE]]) : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: %[[SLICE0_START:.*]] = "tf.ConcatV2"(%[[IND_SLICE0]], // CHECK: %[[OLD_SLICE0:.*]] = "tf.Slice"(%[[READ2]], %[[SLICE0_START]], // CHECK-SAME: (tensor<5x3xf32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x3xf32> - // CHECK: %[[UPDATE_SLICE0_START:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[UPDATE_SLICE0_START:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[UPDATE_SLICE0:.*]] = "tf.Slice"(%[[GATHER]], %[[UPDATE_SLICE0_START]], %[[SLICE_SIZE]]) : (tensor<2x3xf32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x3xf32> // CHECK: %[[ADD0:.*]] = "tf.AddV2"(%[[OLD_SLICE0]], %[[UPDATE_SLICE0]]) // CHECK: %[[UPDATE0:.*]] = "tf.XlaDynamicUpdateSlice"(%[[READ2]], %[[ADD0]] // CHECK-SAME: (tensor<5x3xf32>, tensor<1x3xf32>, tensor<2xi32>) -> tensor<5x3xf32> - // CHECK-DAG: %[[IND_SLICE1_START:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-DAG: %[[IND_SLICE1_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-DAG: %[[IND_SLICE1_START:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // 
CHECK-DAG: %[[IND_SLICE1_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[IND_SLICE1:.*]] = "tf.Slice"(%[[INDS]], %[[IND_SLICE1_START]], %[[IND_SLICE1_SIZE]]) : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: %[[SLICE1_START:.*]] = "tf.ConcatV2"(%[[IND_SLICE1]], // CHECK: %[[OLD_SLICE1:.*]] = "tf.Slice"(%[[UPDATE0]], %[[SLICE1_START]], // CHECK-SAME: (tensor<5x3xf32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x3xf32> - // CHECK: %[[UPDATE_SLICE1_START:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[UPDATE_SLICE1_START:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[UPDATE_SLICE1:.*]] = "tf.Slice"(%[[GATHER]], %[[UPDATE_SLICE1_START]], %[[SLICE_SIZE]]) : (tensor<2x3xf32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x3xf32> // CHECK: %[[ADD1:.*]] = "tf.AddV2"(%[[OLD_SLICE1]], %[[UPDATE_SLICE1]]) // CHECK: %[[UPDATE1:.*]] = "tf.XlaDynamicUpdateSlice"(%[[UPDATE0]], %[[ADD1]] @@ -200,7 +200,7 @@ func.func @main() { // CHECK: "tf.AssignVariableOp"(%[[VAR]], %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf_type.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // CHECK: %[[VALUE:.*]] = "tf.Const"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<3xf32>} : () -> tensor<3xf32> + // CHECK: %[[VALUE:.*]] = "tf.Const"() <{value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<3xf32>}> : () -> tensor<3xf32> %value = "tf.Const"() {value = dense<[1.0, 2.0, 3.0]> : tensor<3xf32>} : () -> tensor<3xf32> // CHECK: %[[GVAR1:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp"(%[[GVAR1]], @@ -240,7 +240,7 @@ func.func @main() { // CHECK-LABEL: func @main func.func @main() -> () { - // CHECK: %[[SIZE:.*]] = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + // CHECK: %[[SIZE:.*]] = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> @@ -257,7 +257,7 @@ func.func @main() -> () { } // CHECK: func @while_body(%[[BARG0:.*]]: tensor>>, %[[BARG1:.*]]: tensor, %[[BARG2:.*]]: tensor>>) func.func @while_body(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BARG1]], %[[CONST1]]) %sub = "tf.Sub"(%arg1, %const1) : (tensor, tensor) -> tensor @@ -288,7 +288,7 @@ func.func @while_cond(%arg0: tensor, %arg1: tensor) -> t // CHECK-LABEL: func @main func.func @main() -> () { - // CHECK: %[[SIZE:.*]] = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + // CHECK: %[[SIZE:.*]] = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> @@ -328,7 +328,7 @@ func.func @then_branch(%arg0: tensor) -> tensor>>, %[[EARG1:.*]]: tensor>>, %[[EARG2:.*]]: tensor>>) func.func 
@else_branch(%arg0: tensor) -> tensor { - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %elem = "tf._SomeOp"() : () -> tensor<3xf32> %flow = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor @@ -348,14 +348,14 @@ func.func @else_branch(%arg0: tensor) -> tensor () { - // CHECK: %[[SIZE:.*]] = "tf.Const"() {value = dense<5> : tensor} + // CHECK: %[[SIZE:.*]] = "tf.Const"() <{value = dense<5> : tensor}> %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK-NOT: tf.TensorArrayV3 %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf_type.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) - // CHECK: %[[FLOW_INIT:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} - // CHECK: %[[WHILE:.*]]:2 = "tf.WhileRegion"(%[[FLOW_INIT]], %[[SIZE]]) ({ + // CHECK: %[[FLOW_INIT:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> + // CHECK: %[[WHILE:.*]]:2 = "tf.WhileRegion"(%[[FLOW_INIT]], %[[SIZE]]) {{.*}} ({ %while:2 = "tf.WhileRegion"(%ta#1, %size) ({ // CHECK: ^bb0(%[[BARG0:.*]]: tensor, %[[BARG1:.*]]: tensor): ^bb0(%barg0: tensor, %barg1: tensor): @@ -402,8 +402,8 @@ func.func @main(%arg0: tensor) -> () { // CHECK: "tf.AssignVariableOp"(%[[TA_BUFFER]] // CHECK-NOT: tf.TensorArrayV3 %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf_type.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) - // CHECK: "tf.IfRegion"(%[[PRED]]) ({ - %case_op = "tf.IfRegion"(%arg0) ({ + // CHECK: "tf.IfRegion"(%[[PRED]]) <{is_stateless = false}> ({ + %case_op = "tf.IfRegion"(%arg0) <{is_stateless = false}> ({ // CHECK: %[[TA_VAL:.*]] = "tf.ReadVariableOp"(%[[TA_BUFFER]]) // CHECK: "tf.Slice"(%[[TA_VAL]] // CHECK-NOT: tf.TensorArrayReadV3 @@ -420,8 +420,8 @@ func.func @main(%arg0: tensor) -> () { %elem = "tf._SomeOp"() : () -> tensor<3xf32> %write = "tf.TensorArrayWriteV3"(%ta#0, %idx, %elem, %ta#1) : (tensor, tensor, tensor<3xf32>, tensor) -> tensor "tf.Yield"(%write) : (tensor) -> () - // CHECK: }) {is_stateless = false} : (tensor) -> tensor - }) {is_stateless = false} : (tensor) -> tensor + // CHECK: }) : (tensor) -> tensor + }) : (tensor) -> tensor %idx = "tf.Const"() {value = dense<6> : tensor} : () -> tensor // CHECK-NOT: tf.TensorArrayReadV3 %read_val = "tf.TensorArrayReadV3"(%ta#0, %idx, %case_op) : (tensor, tensor, tensor) -> tensor<3xf32> @@ -436,7 +436,7 @@ func.func @main(%arg0: tensor) -> () { // CHECK-LABEL: func @main func.func @main() -> () { - // CHECK: %[[SIZE:.*]] = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + // CHECK: %[[SIZE:.*]] = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> @@ -486,7 +486,7 @@ func.func @callee(%arg0: tensor) -> tensor // CHECK-LABEL: func @main func.func @main() -> () { - // CHECK: %[[SIZE:.*]] = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + // CHECK: %[[SIZE:.*]] = "tf.Const"() <{value 
= dense<5> : tensor}> : () -> tensor %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor %index = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> @@ -543,7 +543,7 @@ func.func @callee() -> tensor { // CHECK: "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK: "tf.AssignVariableOp" %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf_type.shape<>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) - // CHECK: %[[SIZE:.*]] = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + // CHECK: %[[SIZE:.*]] = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor %size_out = "tf.TensorArraySizeV3"(%ta#0, %ta#1) : (tensor, tensor) -> tensor // CHECK: return %[[SIZE]] : tensor func.return %size_out : tensor @@ -553,7 +553,7 @@ func.func @callee() -> tensor { // CHECK-LABEL: func @main func.func @main() -> () { - // CHECK: "tf.PartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @callee} : () -> tensor<*xf32> + // CHECK: "tf.PartitionedCall"() <{config = "", config_proto = "", executor_type = "", f = @callee}> : () -> tensor<*xf32> %call = "tf.PartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @callee} : () -> (tensor<*xf32>) func.return } @@ -567,7 +567,7 @@ func.func private @callee() -> (tensor<*xf32>) { // CHECK: "tf.AssignVariableOp"(%[[LOCAL_VAR]], %[[UPDATE]]) : (tensor>>, tensor<5x3xf32>) -> () %flow = "tf.TensorArrayWriteV3"(%ta#0, %index, %value, %ta#1) : (tensor>>, tensor, tensor<3xf32>, tensor) -> tensor // CHECK: %[[SLICE:.*]] = "tf.Slice" - // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<3> : tensor<1xi32>} + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<3> : tensor<1xi32>}> // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) %val = "tf.TensorArrayReadV3"(%ta#0, %index, %ta#1) : (tensor>>, tensor, tensor) -> tensor<*xf32> // CHECK: %[[CAST:.*]] = tensor.cast %[[ELEM]] : tensor<3xf32> to tensor<*xf32> @@ -604,7 +604,7 @@ func.func private @callee(%arg0: tensor) -> tensor : tensor} : () -> tensor // CHECK: %[[BR_INDEX:.*]] = "tf.SomeOp"() : () -> tensor %branch_index = "tf.SomeOp"() : () -> tensor - // CHECK: "tf.CaseRegion"(%[[BR_INDEX]]) ({ + // CHECK: "tf.CaseRegion"(%[[BR_INDEX]]) {{.*}} ({ "tf.CaseRegion"(%branch_index) ({ // CHECK: %[[READ_GVAR:.*]] = "tf.ReadVariableOp"(%[[GVAR]]) // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[READ_GVAR]], diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir index 1177f9f6f9688c..6fb95988b66dd2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir @@ -4,34 +4,34 @@ // CHECK-LABEL: func @main func.func @main() -> (tensor, tensor) { - // CHECK-NEXT: "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK-NEXT: "tf.Const"() <{value = dense<> : tensor<0xi32>}> %elem_shape = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> - // CHECK-NEXT: "tf.Const"() {value = dense<10> : tensor} + // CHECK-NEXT: "tf.Const"() <{value = dense<10> : tensor}> %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor - // CHECK-NEXT: %[[ZERO_SCALAR:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-NEXT: 
%[[ZERO_SCALAR:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-NEXT: %[[CAST_ZERO:.*]] = "tf.Cast"(%[[ZERO_SCALAR]]) : (tensor) -> tensor - // CHECK-NEXT: %[[CONST10:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[CONST10:.*]] = "tf.Const"() <{value = dense<10> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[CAST_ZERO]], %[[CONST10]]) : (tensor, tensor<1xi32>) -> tensor<10xf32> - // CHECK-NEXT: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> %id = "tf.Identity"(%tl) : (tensor>>) -> tensor>> // CHECK-NEXT: %[[PUSHVAL:.*]] = "tf._SomeOp"() %elem = "tf._SomeOp"() : () -> tensor - // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[UPDATE_SLICE:.*]] = "tf.Reshape"(%[[PUSHVAL]], %[[UPDATE_SHAPE]]) : (tensor, tensor<1xi32>) -> tensor<1xf32> // CHECK-NEXT: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[BROADCAST]], %[[UPDATE_SLICE]], %[[ZERO]]) : (tensor<10xf32>, tensor<1xf32>, tensor<1xi32>) -> tensor<10xf32> - // CHECK-NEXT: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[NEW_SIZE:.*]] = "tf.AddV2"(%[[ZERO]], %[[CONST1]]) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> %push = "tf.TensorListPushBack"(%id, %elem) : (tensor>>, tensor) -> tensor>> // CHECK-NEXT: %[[COPY:.*]] = "tf.Identity"(%[[UPDATE]]) - // CHECK-NEXT: %[[CONST1_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[CONST1_1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[SUB:.*]] = "tf.Sub"(%[[NEW_SIZE]], %[[CONST1_1]]) - // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> - // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK-NEXT: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor %pop:2 = "tf.TensorListPopBack"(%push, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) - // CHECK-NEXT: %[[SCALAR_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK-NEXT: %[[SCALAR_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> // CHECK-NEXT: %[[LENGTH:.*]] = "tf.Reshape"(%[[NEW_SIZE]], %[[SCALAR_SHAPE]]) %length = "tf.TensorListLength"(%push) : (tensor>>) -> tensor // CHECK-NEXT: return %[[ELEM]], %[[LENGTH]] : tensor, tensor @@ -46,30 +46,30 @@ func.func @main() -> (tensor, tensor) { // CHECK-LABEL: func @main // CHECK-SAME: (%[[ARG0:.*]]: tensor) -> (tensor, tensor<10xf32>, tensor) func.func 
@main(%arg0: tensor) -> (tensor, tensor<10xf32>, tensor) { - // CHECK-NEXT: "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK-NEXT: "tf.Const"() <{value = dense<> : tensor<0xi32>}> %elem_shape = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> - // CHECK-NEXT: %[[NUM:.*]] = "tf.Const"() {value = dense<10> : tensor} + // CHECK-NEXT: %[[NUM:.*]] = "tf.Const"() <{value = dense<10> : tensor}> %num = "tf.Const"() {value = dense<10> : tensor} : () -> tensor - // CHECK-NEXT: %[[ZERO_SCALAR:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-NEXT: %[[ZERO_SCALAR:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-NEXT: %[[CAST_ZERO:.*]] = "tf.Cast"(%[[ZERO_SCALAR]]) : (tensor) -> tensor - // CHECK-NEXT: %[[CONST10:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[CONST10:.*]] = "tf.Const"() <{value = dense<10> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[CAST_ZERO]], %[[CONST10]]) : (tensor, tensor<1xi32>) -> tensor<10xf32> - // CHECK-NEXT: %[[SIZE_SHAPE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK-NEXT: %[[SIZE_SHAPE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK-NEXT: %[[SIZE:.*]] = "tf.Reshape"(%[[NUM]], %[[SIZE_SHAPE]]) %tl = "tf.TensorListReserve"(%elem_shape, %num) : (tensor<0xi32>, tensor) -> tensor>> // CHECK-NEXT: %[[SETVAL:.*]] = "tf._SomeOp"() %elem = "tf._SomeOp"() : () -> tensor - // CHECK-NEXT: %[[SIZE_SHAPE1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK-NEXT: %[[SIZE_SHAPE1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK-NEXT: %[[SET_INDEX:.*]] = "tf.Reshape"(%[[ARG0]], %[[SIZE_SHAPE1]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> - // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[UPDATE_SLICE:.*]] = "tf.Reshape"(%[[SETVAL]], %[[UPDATE_SHAPE]]) : (tensor, tensor<1xi32>) -> tensor<1xf32> // CHECK-NEXT: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[BROADCAST]], %[[UPDATE_SLICE]], %[[SET_INDEX]]) : (tensor<10xf32>, tensor<1xf32>, tensor<1xi32>) -> tensor<10xf32> %set = "tf.TensorListSetItem"(%tl, %arg0, %elem) : (tensor>>, tensor, tensor) -> tensor>> - // CHECK-NEXT: %[[SIZE_SHAPE2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK-NEXT: %[[SIZE_SHAPE2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK-NEXT: %[[GET_INDEX:.*]] = "tf.Reshape"(%[[ARG0]], %[[SIZE_SHAPE2]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> - // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[SLICE:.*]] = "tf.Slice"(%[[UPDATE]], %[[GET_INDEX]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> - // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK-NEXT: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor %get = "tf.TensorListGetItem"(%set, %arg0, %elem_shape) : (tensor>>, tensor, tensor<0xi32>) -> tensor // CHECK-NEXT: %[[ADDN:.*]] 
= "tf.AddN"(%[[UPDATE]], %[[BROADCAST]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> @@ -79,7 +79,7 @@ func.func @main(%arg0: tensor) -> (tensor, tensor<10xf32>, tensor // CHECK-NEXT: %[[ADDN2:.*]] = "tf.AddN"(%[[ADDN]], %[[ZEROS_LIKE]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> %addn2 = "tf.AddN"(%addn, %zeros-like) : (tensor>>, tensor>>) -> tensor>> %stack = "tf.TensorListStack"(%addn2, %elem_shape) : (tensor>>, tensor<0xi32>) -> tensor<10xf32> - // CHECK-NEXT: %[[LEN:.*]] = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NEXT: %[[LEN:.*]] = "tf.Const"() <{value = dense<10> : tensor}> : () -> tensor %length = "tf.TensorListLength"(%addn2) : (tensor>>) -> tensor // CHECK-NEXT: return %[[ELEM]], %[[ADDN2]], %[[LEN]] : tensor, tensor<10xf32>, tensor func.return %get, %stack, %length : tensor, tensor<10xf32>, tensor @@ -92,16 +92,16 @@ func.func @main(%arg0: tensor) -> (tensor, tensor<10xf32>, tensor // CHECK-LABEL: func @main // CHECK-SAME: (%[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor<10xf32>) -> tensor func.func @main(%arg0: tensor, %arg1: tensor<10xf32>) -> tensor { - // CHECK-NEXT: "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK-NEXT: "tf.Const"() <{value = dense<> : tensor<0xi32>}> %elem_shape = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> // CHECK-NEXT: %[[BUFFER:.*]] = "tf.Identity"(%[[ARG1]]) : (tensor<10xf32>) -> tensor<10xf32> - // CHECK-NEXT: %[[SIZE:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[SIZE:.*]] = "tf.Const"() <{value = dense<10> : tensor<1xi32>}> : () -> tensor<1xi32> %tl = "tf.TensorListFromTensor"(%arg1, %elem_shape) : (tensor<10xf32>, tensor<0xi32>) -> tensor>> - // CHECK-NEXT: %[[SIZE_SHAPE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK-NEXT: %[[SIZE_SHAPE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK-NEXT: %[[GET_INDEX:.*]] = "tf.Reshape"(%[[ARG0]], %[[SIZE_SHAPE]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> - // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %[[SLICE:.*]] = "tf.Slice"(%[[BUFFER]], %[[GET_INDEX]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> - // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK-NEXT: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor %get = "tf.TensorListGetItem"(%tl, %arg0, %elem_shape) : (tensor>>, tensor, tensor<0xi32>) -> tensor // CHECK-NEXT: return %[[ELEM]] : tensor @@ -116,7 +116,7 @@ func.func @main(%arg0: tensor, %arg1: tensor<10xf32>) -> tensor { func.func @main(%arg0: tensor<10x8x9xf32>) -> tensor<2xi64> { %elem_shape = "tf.Const"() {value = dense<[8, 9]> : tensor<2xi32>} : () -> tensor<2xi32> %tl = "tf.TensorListFromTensor"(%arg0, %elem_shape) : (tensor<10x8x9xf32>, tensor<2xi32>) -> tensor>> - // CHECK: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[8, 9]> : tensor<2xi64>} : () -> tensor<2xi64> + // CHECK: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[8, 9]> : tensor<2xi64>}> : () -> tensor<2xi64> %shape = "tf.TensorListElementShape"(%tl) : (tensor>>) -> tensor<2xi64> // CHECK-NEXT: return %[[SHAPE]] : tensor<2xi64> func.return 
%shape: tensor<2xi64> @@ -132,7 +132,7 @@ func.func @main(%arg0: tensor<10x8x9xf32>, %arg1: tensor<3xi32>) -> tensor<3x8x9 %elem_shape = "tf.Const"() {value = dense<[8, 9]> : tensor<2xi32>} : () -> tensor<2xi32> // CHECK: %[[BUFFER:.*]] = "tf.Identity"(%[[ARG0]]) : (tensor<10x8x9xf32>) -> tensor<10x8x9xf32> %tl = "tf.TensorListFromTensor"(%arg0, %elem_shape) : (tensor<10x8x9xf32>, tensor<2xi32>) -> tensor>> - // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %[[GATHER:.*]] = "tf.GatherV2"(%[[BUFFER]], %[[ARG1]], %[[AXIS]]) : (tensor<10x8x9xf32>, tensor<3xi32>, tensor) -> tensor<3x8x9xf32> %gather = "tf.TensorListGather"(%tl, %arg1, %elem_shape) : (tensor>>, tensor<3xi32>, tensor<2xi32>) -> tensor<3x8x9xf32> // CHECK-NEXT: return %[[GATHER]] : tensor<3x8x9xf32> @@ -149,7 +149,7 @@ func.func @main(%arg0: tensor<10x8x9xf32>, %arg1: tensor<5xi32>, %arg2: tensor<5 %elem_shape = "tf.Const"() {value = dense<[8, 9]> : tensor<2xi32>} : () -> tensor<2xi32> // CHECK: %[[BUFFER:.*]] = "tf.Identity"(%[[ARG0]]) : (tensor<10x8x9xf32>) -> tensor<10x8x9xf32> %tl = "tf.TensorListFromTensor"(%arg0, %elem_shape) : (tensor<10x8x9xf32>, tensor<2xi32>) -> tensor>> - // CHECK: %[[IND_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 1]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[IND_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 1]> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[IND_RESHPE:.*]] = "tf.Reshape"(%[[ARG1]], %[[IND_SHAPE]]) : (tensor<5xi32>, tensor<2xi32>) -> tensor<5x1xi32> // CHECK: %[[SC:.*]] = "tf.TensorScatterUpdate"(%[[BUFFER]], %[[IND_RESHPE]], %[[ARG2]]) : (tensor<10x8x9xf32>, tensor<5x1xi32>, tensor<5x8x9xf32>) -> tensor<10x8x9xf32> %scatter = "tf.TensorListScatterIntoExistingList"(%tl, %arg2, %arg1) : (tensor>>, tensor<5x8x9xf32>, tensor<5xi32>) -> tensor>> @@ -179,14 +179,14 @@ func.func @main() -> () { } // CHECK: func @while_body(%[[BARG0:.*]]: tensor<10xf32>, %[[BARG1:.*]]: tensor, %[[BARG2:.*]]: tensor<1xi32>) func.func @while_body(%arg0: tensor>>, %arg1: tensor) -> (tensor>>, tensor) { - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BARG1]], %[[CONST1]]) %sub = "tf.Sub"(%arg1, %const1) : (tensor, tensor) -> tensor %elem = "tf._SomeOp"() : () -> tensor // CHECK-NOT: "tf.TensorListPushBack" // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[BARG2]], %[[CONST1]]) // CHECK-NOT: "tf.TensorListPushBack" %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> @@ -222,7 +222,7 @@ func.func @if_then(%arg0: tensor>>) -> tensor tensor // CHECK-NOT: "tf.TensorListPushBack" // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[TARG1]], %[[CONST1]]) // CHECK-NOT: "tf.TensorListPushBack" %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, 
tensor) -> tensor>> @@ -234,11 +234,11 @@ func.func @if_else(%arg0: tensor>>) -> tensor : tensor<0xi32>} : () -> tensor<0xi32> // CHECK-NOT: "tf.TensorListPopBack" // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[EARG0]]) - // CHECK: %[[CONST1_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[CONST1_1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[EARG1]], %[[CONST1_1]]) - // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> - // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor // CHECK-NOT: "tf.TensorListPopBack" %pop:2 = "tf.TensorListPopBack"(%arg0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) @@ -269,7 +269,7 @@ func.func @branch_0(%arg0: tensor>>) -> tensor tensor // CHECK-NOT: "tf.TensorListPushBack" // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[TARG1]], %[[CONST1]]) // CHECK-NOT: "tf.TensorListPushBack" %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> @@ -281,11 +281,11 @@ func.func @branch_1(%arg0: tensor>>) -> tensor : tensor<0xi32>} : () -> tensor<0xi32> // CHECK-NOT: "tf.TensorListPopBack" // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[EARG0]]) - // CHECK: %[[CONST1_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[CONST1_1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[EARG1]], %[[CONST1_1]]) - // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> - // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor // CHECK-NOT: "tf.TensorListPopBack" %pop:2 = "tf.TensorListPopBack"(%arg0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) @@ -297,11 +297,11 @@ func.func @branch_2(%arg0: tensor>>) -> tensor : tensor<0xi32>} : () -> tensor<0xi32> // CHECK-NOT: "tf.TensorListPopBack" // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[EARG0]]) - // CHECK: %[[CONST1_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[CONST1_1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[EARG1]], %[[CONST1_1]]) - // 
CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> - // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor // CHECK-NOT: "tf.TensorListPopBack" %pop:2 = "tf.TensorListPopBack"(%arg0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) @@ -317,7 +317,7 @@ func.func @main() -> tensor { %size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor // CHECK-NOT: tf.EmptyTensorList %tl = "tf.EmptyTensorList"(%elem_shape, %size) : (tensor<0xi32>, tensor) -> tensor>> - %while_op:2 = "tf.WhileRegion"(%tl, %size) ({ + %while_op:2 = "tf.WhileRegion"(%tl, %size) <{is_stateless = false}> ({ // CHECK: ^bb0(%[[CARG0:.*]]: tensor<10xf32>, %[[CARG1:.*]]: tensor, %[[CARG2:.*]]: tensor<1xi32>): ^bb0(%arg0: tensor>>, %arg1: tensor): // CHECK: %[[PRED:.*]] = "tf._SomeOp"() @@ -327,7 +327,7 @@ func.func @main() -> tensor { }, { // CHECK: ^bb0(%[[CARG0:.*]]: tensor<10xf32>, %[[CARG1:.*]]: tensor, %[[CARG2:.*]]: tensor<1xi32>): ^bb0(%arg0: tensor>>, %arg1: tensor): - // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[CARG1]], %[[CST]]) // CHECK: %[[ELEM:.*]] = "tf._SomeOp"() : () -> tensor %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor @@ -335,14 +335,14 @@ func.func @main() -> tensor { %elem = "tf._SomeOp"() : () -> tensor // CHECK-NOT: "tf.TensorListPushBack" // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[CARG0]] - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[CARG2]], %[[ONE]]) // CHECK-NOT: "tf.TensorListPushBack" // CHECK: "tf.Yield"(%[[UPDATE]], %[[SUB]], %[[ADD]]) - // CHECK: }) {is_stateless = false} + // CHECK: }) %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> "tf.Yield"(%push, %sub) : (tensor>>, tensor) -> () - }) {is_stateless = false} : (tensor>>, tensor) -> (tensor>>, tensor) + }) : (tensor>>, tensor) -> (tensor>>, tensor) // CHECK: "tf.Slice" // CHECK-NOT: tf.TensorListPopBack %pop:2 = "tf.TensorListPopBack"(%while_op#0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) @@ -356,27 +356,27 @@ func.func @main(%arg0: tensor) -> () { %elem_shape = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0> : tensor}> // CHECK: %[[ZERO_F32:.*]] = "tf.Cast"(%[[ZERO]]) - // CHECK: %[[MAX_SIZE:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} + // CHECK: %[[MAX_SIZE:.*]] = "tf.Const"() <{value = dense<10> : tensor<1xi32>}> // CHECK: %[[BUFFER:.*]] = "tf.BroadcastTo"(%[[ZERO_F32]], %[[MAX_SIZE]]) - // 
CHECK: %[[BUFFER_SIZE:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} + // CHECK: %[[BUFFER_SIZE:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> // CHECK-NOT: tf.EmptyTensorList %if_op = "tf.IfRegion"(%arg0) ({ %elem = "tf._SomeOp"() : () -> tensor // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[BUFFER_SIZE]], %[[ONE]]) // CHECK-NOT: "tf.TensorListPushBack" %push = "tf.TensorListPushBack"(%tl, %elem) : (tensor>>, tensor) -> tensor>> "tf.Yield" (%push) : (tensor>>) -> () }, { // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[BUFFER]]) - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BUFFER_SIZE]], %[[ONE]]) - // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) - // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) // CHECK-NOT: "tf.TensorListPopBack" %pop:2 = "tf.TensorListPopBack"(%tl, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) @@ -397,28 +397,28 @@ func.func @main(%arg0: tensor) -> () { func.func @main(%arg0: tensor) -> () { %elem_shape = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor - // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: %[[ZERO:.*]] = "tf.Const"() <{value = dense<0> : tensor}> // CHECK: %[[ZERO_F32:.*]] = "tf.Cast"(%[[ZERO]]) - // CHECK: %[[MAX_SIZE:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} + // CHECK: %[[MAX_SIZE:.*]] = "tf.Const"() <{value = dense<10> : tensor<1xi32>}> // CHECK: %[[BUFFER:.*]] = "tf.BroadcastTo"(%[[ZERO_F32]], %[[MAX_SIZE]]) - // CHECK: %[[BUFFER_SIZE:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} + // CHECK: %[[BUFFER_SIZE:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> // CHECK-NOT: tf.EmptyTensorList %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> %case_op = "tf.CaseRegion"(%arg0) ({ %elem = "tf._SomeOp"() : () -> tensor // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[BUFFER_SIZE]], %[[ONE]]) // CHECK-NOT: "tf.TensorListPushBack" %push = "tf.TensorListPushBack"(%tl, %elem) : (tensor>>, tensor) -> tensor>> "tf.Yield" (%push) : (tensor>>) -> () }, { // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[BUFFER]]) - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BUFFER_SIZE]], %[[ONE]]) - // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK: 
%[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) - // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) // CHECK-NOT: "tf.TensorListPopBack" %pop:2 = "tf.TensorListPopBack"(%tl, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) @@ -426,11 +426,11 @@ func.func @main(%arg0: tensor) -> () { "tf.Yield" (%pop#0) : (tensor>>) -> () }, { // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[BUFFER]]) - // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[ONE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BUFFER_SIZE]], %[[ONE]]) - // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) - // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi32>}> // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) // CHECK-NOT: "tf.TensorListPopBack" %pop:2 = "tf.TensorListPopBack"(%tl, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) @@ -482,7 +482,7 @@ func.func @callee(%arg0: tensor>>, %arg1: tensor, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor<1xi32>) -> (tensor<10xf32>, tensor<1xi32>) // CHECK-NOT: "tf.TensorListPushBack" // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" -// CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ARG2]], %[[CONST1]]) // CHECK-NOT: "tf.TensorListPushBack" // CHECK: return %[[UPDATE]], %[[ADD]] @@ -520,7 +520,7 @@ func.func private @callee(%arg0: tensor>>, %arg1: t // CHECK-NOT: "tf.TensorListPushBack" // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ARG2]], %[[CONST1]]) // CHECK-NOT: "tf.TensorListPushBack" %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 22de9ee736426a..7605f0360625fe 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1185,7 +1185,7 @@ func.func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { // Test invalid tf.Yield operation (parent should be IfRegion) func.func @testInvalidYieldOp(%arg0: f32) -> () { - // expected-error @+1 {{'tf.Yield' op expects parent op to be one of 'tf.CaseRegion, tf.IfRegion, tf.WhileRegion'}} + // expected-error @+1 {{'tf.Yield' op expects parent op to be one of 'tf.CaseRegion, tf.IfRegion, tf.WhileRegion, tf.GeneratorDatasetRegion'}} "tf.Yield"(%arg0) : (f32) -> () } @@ -5180,3 +5180,41 @@ func.func @test_xla_call_module_with_invalid_symbol() { "tf.XlaCallModule"() {Sout = [], device = "", dim_args_spec = [], function_list = [@undefined_function], module = "", platforms = [], version = 4 : i64} : () -> 
() func.return } + +// ----- + +func.func @init(%arg0: tensor<4xf32>) -> tensor<7xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<7xf32> + return %0 : tensor<7xf32> +} + +func.func @next(%arg0: tensor<7xf32>, %arg1: tensor<3xf32>) -> tensor<6xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<6xf32> + return %0 : tensor<6xf32> +} + +func.func @finalize(%arg0: tensor<6xf32>, %arg1: tensor<2xf32>) -> tensor<5xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<5xf32> + return %0 : tensor<5xf32> +} + +// CHECK-LABEL: func @testGeneratorDataset +func.func @testGeneratorDataset(%arg0: tensor<4xf32>, + %arg1: tensor<3xf32>, + %arg2: tensor, + %arg3: tensor<2xf32>) -> tensor { + %0 = "tf.GeneratorDataset"(%arg0, %arg1, %arg2, %arg3) { + device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", + finalize_func = @finalize, + init_func = @init, + next_func = @next, + operandSegmentSizes = array, + output_shapes = [#tf_type.shape<>], + output_types = [!tf_type.string], + metadata = ""} : ( + tensor<4xf32>, + tensor<3xf32>, + tensor, + tensor<2xf32>) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_map_and_batch.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_map_and_batch.mlir index af605e92aed4e8..c6f63db25acc68 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_map_and_batch.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_data_fuse_map_and_batch.mlir @@ -6,7 +6,7 @@ func.func @fuse_map_and_batch() -> tensor attributes {tf.entry %0 = "tf.Const"() {value = dense<5> : tensor} : () -> tensor %1 = "tf.Const"() {value = dense : tensor} : () -> tensor %2 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> tensor<3xi32> - // CHECK: %[[NPC:.*]] = "tf.Const"() {value = dense<1> : tensor} + // CHECK: %[[NPC:.*]] = "tf.Const"() <{value = dense<1> : tensor}> // CHECK: %[[TSLICE:.*]] = "tf.TensorSliceDataset" %3 = "tf.TensorSliceDataset"(%2) {device = "", output_shapes = [#tf_type.shape<>], metadata = ""} : (tensor<3xi32>) -> tensor<*x!tf_type.variant> // CHECK: "tf.MapAndBatchDataset"(%[[TSLICE]], %[[BSIZE:.*]], %[[NPC]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_optimize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_optimize.mlir index 3dacd11a31b0bc..d0a5a74edc5b14 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_optimize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_optimize.mlir @@ -12,7 +12,7 @@ func.func @fuseMulIntoConv2d(%arg0: tensor<1x112x112x3xf32>) -> tensor<1x28x23x2 // CHECK-SAME: [1.000000e+00, 4.000000e+00], [3.000000e+00, 8.000000e+00], [5.000000e+00, 1.200000e+01] // CHECK-SAME: [7.000000e+00, 1.600000e+01], [9.000000e+00, 2.000000e+01], [1.100000e+01, 2.400000e+01] // CHECK-SAME: [1.300000e+01, 2.800000e+01], [1.500000e+01, 3.200000e+01], [1.700000e+01, 3.600000e+01] - // CHECK: %[[CONV:.*]] = "tf.Conv2D"(%arg0, %[[CST]]) {data_format = "NHWC", dilations = [1, 2, 3, 1], explicit_paddings = [], padding = "SAME", strides = [1, 4, 5, 1], use_cudnn_on_gpu = true} + // CHECK: %[[CONV:.*]] = "tf.Conv2D"(%arg0, %[[CST]]) <{data_format = "NHWC", dilations = [1, 2, 3, 1], explicit_paddings = [], padding = "SAME", strides = [1, 4, 5, 1], use_cudnn_on_gpu = true}> // CHECK: return %[[CONV]] : tensor<1x28x23x2xf32> } @@ -26,7 +26,7 @@ func.func @notfuseMulIntoConv2d(%arg0: tensor<1x112x112x3xf32>) -> tensor<1x28x2 func.return %1 : tensor<1x28x23x2xf32> // CHECK: %cst_0 = arith.constant dense<3.000000e+00> : tensor<23x2xf32> - // 
CHECK: %0 = "tf.Conv2D"(%arg0, %cst) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} + // CHECK: %0 = "tf.Conv2D"(%arg0, %cst) <{data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]}> {T = "tfdtype$DT_FLOAT"} // CHECK: %1 = "tf.Mul"(%0, %cst_0) : (tensor<1x28x23x2xf32>, tensor<23x2xf32>) -> tensor<1x28x23x2xf32> // CHECK: return %1 : tensor<1x28x23x2xf32> } @@ -40,8 +40,8 @@ func.func @simplifyBroadcastReshape(%arg0: tensor<1x8x1x1x1x1x1x18xbf16>) -> ten %98 = "tf.Reshape"(%97, %cst_2) : (tensor<1x8x6x1x6x1x1x18xbf16>, tensor<4xi64>) -> tensor<8x6x6x18xbf16> func.return %98 : tensor<8x6x6x18xbf16> - // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[8, 1, 1, 18]> : tensor<4xi64>} : () -> tensor<4xi64> - // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<[8, 6, 6, 18]> : tensor<4xi64>} : () -> tensor<4xi64> + // CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[8, 1, 1, 18]> : tensor<4xi64>}> : () -> tensor<4xi64> + // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<[8, 6, 6, 18]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[CST]]) : (tensor<1x8x1x1x1x1x1x18xbf16>, tensor<4xi64>) -> tensor<8x1x1x18xbf16> // CHECK: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[RESHAPE]], %[[CST1]]) : (tensor<8x1x1x18xbf16>, tensor<4xi64>) -> tensor<8x6x6x18xbf16> // CHECK: return %[[BROADCAST]] : tensor<8x6x6x18xbf16> @@ -55,8 +55,8 @@ func.func @simplifyBroadcastReshapeExtraDims(%arg0: tensor<1x8x1x1x1x1x1x18xbf16 %98 = "tf.Reshape"(%97, %cst_2) : (tensor<7x1x8x6x1x6x1x1x18xbf16>, tensor<5xi64>) -> tensor<7x8x6x6x18xbf16> func.return %98 : tensor<7x8x6x6x18xbf16> - // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 8, 1, 1, 18]> : tensor<5xi64>} : () -> tensor<5xi64> - // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<[7, 8, 6, 6, 18]> : tensor<5xi64>} : () -> tensor<5xi64> + // CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 8, 1, 1, 18]> : tensor<5xi64>}> : () -> tensor<5xi64> + // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<[7, 8, 6, 6, 18]> : tensor<5xi64>}> : () -> tensor<5xi64> // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[CST]]) : (tensor<1x8x1x1x1x1x1x18xbf16>, tensor<5xi64>) -> tensor<1x8x1x1x18xbf16> // CHECK: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[RESHAPE]], %[[CST1]]) : (tensor<1x8x1x1x18xbf16>, tensor<5xi64>) -> tensor<7x8x6x6x18xbf16> // CHECK: return %[[BROADCAST]] : tensor<7x8x6x6x18xbf16> @@ -70,8 +70,8 @@ func.func @simplifyBroadcastReshapeOnes(%arg0: tensor<1x1x1x1x1x1x1x18xbf16>) -> %98 = "tf.Reshape"(%97, %cst_2) : (tensor<1x1x6x1x6x1x1x18xbf16>, tensor<5xi64>) -> tensor<1x6x1x6x18xbf16> func.return %98 : tensor<1x6x1x6x18xbf16> - // CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 1, 1, 1, 18]> : tensor<5xi64>} : () -> tensor<5xi64> - // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() {value = dense<[1, 6, 1, 6, 18]> : tensor<5xi64>} : () -> tensor<5xi64> + // CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 1, 1, 1, 18]> : tensor<5xi64>}> : () -> tensor<5xi64> + // CHECK-DAG: %[[CST1:.*]] = "tf.Const"() <{value = dense<[1, 6, 1, 6, 18]> : tensor<5xi64>}> : () -> tensor<5xi64> // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[CST]]) : (tensor<1x1x1x1x1x1x1x18xbf16>, tensor<5xi64>) -> tensor<1x1x1x1x18xbf16> // CHECK: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[RESHAPE]], %[[CST1]]) : (tensor<1x1x1x1x18xbf16>, tensor<5xi64>) -> tensor<1x6x1x6x18xbf16> // CHECK: return 
%[[BROADCAST]] : tensor<1x6x1x6x18xbf16> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic_v1.py index 9b71d46edf5fda..2d620c8df759ce 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic_v1.py @@ -38,7 +38,7 @@ # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["key"] # CHECK-NEXT: [[R0:%.*]] = "tf.ReadVariableOp"([[ARG1]]) {{{.*}}} : (tensor>>) -> tensor<1x3xf32> -# CHECK-NEXT: [[R1:%.*]] = "tf.MatMul"([[ARG0]], [[R0]]) {{{.*}}} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> +# CHECK-NEXT: [[R1:%.*]] = "tf.MatMul"([[ARG0]], [[R0]]) <{{{.*}}}> {device = ""} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> # CHECK-NEXT: return [[R1]] : tensor<3x3xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py index b3cd46c2d60517..744f302dac610b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py @@ -35,7 +35,7 @@ # CHECK: func @[[init]] # CHECK-SAME: tf_saved_model.initializer_type = "init_op" # CHECK-NEXT: [[R6:%.*]] = "tf.Const"() -# CHECK-NEXT: [[R5:%.*]] = "tf.Const"() {device = "", value = dense<[1, 2, +# CHECK-NEXT: [[R5:%.*]] = "tf.Const"() <{value = dense<[1, 2, # CHECK-NEXT: [[R7:%.*]] = "tf.HashTableV2"() # CHECK-SAME: shared_name = "[[hash_table:.*]]" # CHECK-NEXT: "tf.LookupTableImportV2"([[R7]], [[R5]], [[R6]]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/include_variables_in_init_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/include_variables_in_init_v1.py index 2f99fae8d8d286..bc8457aab0f63c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/include_variables_in_init_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/include_variables_in_init_v1.py @@ -38,7 +38,7 @@ # CHECK-SAME: tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_init"] # CHECK-SAME: tf_saved_model.initializer_type = "init_op" # CHECK-SAME: } -# CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {{{.*dense<.*> : tensor<2xi32>.*}}} : () -> tensor<2xi32> +# CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{{{.*dense<.*> : tensor<2xi32>.*}}}> {{{.*}}} : () -> tensor<2xi32> # CHECK: %[[RAND_STD_NORMAL:.*]] = "tf.RandomStandardNormal"(%[[CST_0]]) # CHECK: "tf.AssignVariableOp"(%[[ARG_0]], %[[RAND_STD_NORMAL]]){{.*}}: (tensor>>, tensor<1x3xf32>) -> () # CHECK: return @@ -50,7 +50,7 @@ # CHECK-SAME: -> (tensor<3x3xf32> {tf_saved_model.index_path = ["r"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["key"] # CHECK-NEXT: %[[READ_VAR_0:.*]] = "tf.ReadVariableOp"(%[[ARG_2]]) {{{.*}}} : (tensor>>) -> tensor<1x3xf32> -# CHECK-NEXT: %[[MATMUL_0:.*]] = "tf.MatMul"(%[[ARG_1]], %[[READ_VAR_0]]) {{{.*}}} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> +# CHECK-NEXT: %[[MATMUL_0:.*]] = "tf.MatMul"(%[[ARG_1]], %[[READ_VAR_0]]) <{{{.*}}}> {{{.*}}} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> # CHECK-NEXT: return %[[MATMUL_0]] : tensor<3x3xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/multi_variables_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/multi_variables_v1.py index 4ce47a3aec61ef..6eb8cbe20b0fbf 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/multi_variables_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/multi_variables_v1.py @@ -29,7 +29,7 @@ # CHECK-NEXT: [[R0:%.*]] = "tf.ReadVariableOp"([[ARG0]]) {{{.*}}} : (tensor>>) -> tensor<5x3xf32> # CHECK-NEXT: [[R1:%.*]] = "tf.ReadVariableOp"([[ARG1]]) {{{.*}}} : (tensor>>) -> tensor<3x5xf32> -# CHECK-NEXT: [[R2:%.*]] = "tf.MatMul"([[R0]], [[R1]]) {{{.*}}} : (tensor<5x3xf32>, tensor<3x5xf32>) -> tensor<5x5xf32> +# CHECK-NEXT: [[R2:%.*]] = "tf.MatMul"([[R0]], [[R1]]) <{{{.*}}}> {{{.*}}} : (tensor<5x3xf32>, tensor<3x5xf32>) -> tensor<5x5xf32> def Test(): diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/remove_init_variable_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/remove_init_variable_v1.py index 123a11a1883521..261ff5577eb209 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/remove_init_variable_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/remove_init_variable_v1.py @@ -39,7 +39,7 @@ # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["key"] # CHECK-NEXT: [[R0:%.*]] = "tf.ReadVariableOp"([[ARG1]]) {{{.*}}} : (tensor>>) -> tensor<1x3xf32> -# CHECK-NEXT: [[R1:%.*]] = "tf.MatMul"([[ARG0]], [[R0]]) {{{.*}}} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> +# CHECK-NEXT: [[R1:%.*]] = "tf.MatMul"([[ARG0]], [[R0]]) <{{{.*}}}> {{{.*}}} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> # CHECK-NEXT: return [[R1]] : tensor<3x3xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_asset_sinking.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_asset_sinking.mlir index 2638aab86b890b..94e0da360795af 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_asset_sinking.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_asset_sinking.mlir @@ -10,8 +10,8 @@ module @asset attributes {tf_saved_model.semantics} { // CHECK: func @init() func.func @init(%arg0: tensor {tf_saved_model.bound_input = @asset0}, %arg1: tensor {tf_saved_model.bound_input = @asset1}) attributes {tf_saved_model.exported_names = ["init"]} { - // CHECK-DAG: %[[ASSET0:.*]] = "tf.Const"() {value = dense<"foo/bar/assets/test0.txt"> : tensor} - // CHECK-DAG: %[[ASSET1:.*]] = "tf.Const"() {value = dense<"foo/bar/assets/test1.txt"> : tensor} + // CHECK-DAG: %[[ASSET0:.*]] = "tf.Const"() <{value = dense<"foo/bar/assets/test0.txt"> : tensor}> + // CHECK-DAG: %[[ASSET1:.*]] = "tf.Const"() <{value = dense<"foo/bar/assets/test1.txt"> : tensor}> // CHECK: %[[VAR0:.*]] = "tf.VarHandleOp"() %0 = "tf.VarHandleOp"() {container = "", shared_name = "var0"} : () -> tensor>> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_assets.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_assets.mlir index eb4aed85564dab..982ace4360ae98 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_assets.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_assets.mlir @@ -11,7 +11,7 @@ module attributes {tf_saved_model.semantics} { attributes {tf_saved_model.exported_names = ["f"]} { %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "", use_node_name_sharing = false, value_dtype = i64} : () -> tensor "tf.InitializeTableFromTextFileV2"(%0, %arg0) {delimiter = "\09", device = "", key_index = -2 : i64, offset = 0 : i64, value_index = -1 : i64, vocab_size = 437 : i64} : (tensor, 
tensor) -> () - // CHECK: [[CST:%.+]] = "tf.Const"() {value = dense<"assets/table.txt"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + // CHECK: [[CST:%.+]] = "tf.Const"() <{value = dense<"assets/table.txt"> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string> // CHECK: [[HASHTABLE:%.+]] = "tf.HashTableV2"() // CHECK: "tf.InitializeTableFromTextFileV2"([[HASHTABLE]], [[CST]]) func.return @@ -69,8 +69,8 @@ module attributes {tf_saved_model.semantics} { "tf.InitializeTableFromTextFileV2"(%0, %arg0) {delimiter = "\09", device = "", key_index = -2 : i64, offset = 0 : i64, value_index = -1 : i64, vocab_size = 437 : i64} : (tensor, tensor) -> () %1 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "", use_node_name_sharing = false, value_dtype = i64} : () -> tensor "tf.InitializeTableFromTextFileV2"(%1, %arg1) {delimiter = "\09", device = "", key_index = -2 : i64, offset = 0 : i64, value_index = -1 : i64, vocab_size = 437 : i64} : (tensor, tensor) -> () - // CHECK-DAG: [[CST_1:%.+]] = "tf.Const"() {value = dense<"assets/table2.txt"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> - // CHECK-DAG: [[CST:%.+]] = "tf.Const"() {value = dense<"assets/table.txt"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + // CHECK-DAG: [[CST_1:%.+]] = "tf.Const"() <{value = dense<"assets/table2.txt"> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string> + // CHECK-DAG: [[CST:%.+]] = "tf.Const"() <{value = dense<"assets/table.txt"> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string> // CHECK: [[HASHTABLE:%.+]] = "tf.HashTableV2"() // CHECK: "tf.InitializeTableFromTextFileV2"([[HASHTABLE]], [[CST]]) // CHECK: [[HASHTABLE_1:%.+]] = "tf.HashTableV2"() diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir index 67a5439524f8e3..9c0f9b2eddb29b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir @@ -12,7 +12,7 @@ module attributes {tf_saved_model.semantics} { func.func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) attributes {tf_saved_model.exported_names = ["f"]} { %val = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor - // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK: "tf.Const"() <{value = dense<1.000000e+00> : tensor}> func.return } } @@ -67,7 +67,7 @@ module attributes {tf_saved_model.semantics} { } func.func private @f_callee(%arg0: tensor>>) { - // CHECK: "tf.Const"() {value = dense<2.100000e+01> : tensor} + // CHECK: "tf.Const"() <{value = dense<2.100000e+01> : tensor}> func.return } } @@ -90,7 +90,7 @@ module attributes {tf_saved_model.semantics} { func.func private @g_callee(%arg0: tensor>>) { %val = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor - // CHECK: "tf.Const"() {value = dense<3.200000e+01> : tensor} + // CHECK: "tf.Const"() <{value = dense<3.200000e+01> : tensor}> func.return } } @@ -146,10 +146,10 @@ module attributes {tf_saved_model.semantics} { func.func @f(%arg1: tensor>> {tf_saved_model.bound_input = @"v"}, %arg2: tensor>> {tf_saved_model.bound_input = @"v2"}) attributes {tf_saved_model.exported_names = ["f"]} { - // CHECK-DAG: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK-DAG: "tf.Const"() <{value = dense<1.000000e+00> : tensor}> %0 = 
"tf.ReadVariableOp"(%arg1) {device = ""} : (tensor>>) -> tensor - // CHECK-DAG: "tf.Const"() {value = dense<2.000000e+00> : tensor} + // CHECK-DAG: "tf.Const"() <{value = dense<2.000000e+00> : tensor}> %1 = "tf.ReadVariableOp"(%arg2) {device = ""} : (tensor>>) -> tensor func.return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_mark_initialized_variables.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_mark_initialized_variables.mlir index 51368f29fa76bd..4f6967990eb162 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_mark_initialized_variables.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_mark_initialized_variables.mlir @@ -15,14 +15,14 @@ module attributes {tf_saved_model.semantics, tf_saved_model.under_construction} func.return %4 : tensor<100x50xf32> } // CHECK: "tf.VarHandleOp" - // CHECK-SAME: _is_initialized = true // CHECK-SAME: shared_name = "var1" - // CHECK: "tf.VarHandleOp" // CHECK-SAME: _is_initialized = true + // CHECK: "tf.VarHandleOp" // CHECK-SAME: shared_name = "var2" + // CHECK-SAME: _is_initialized = true // CHECK: "tf.VarHandleOp" - // CHECK-SAME: _is_initialized = false // CHECK-SAME: shared_name = "var3" + // CHECK-SAME: _is_initialized = false // INVALID-NOT: _is_initialized } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir index 6d655ab4ffc95a..553bfa0955106b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir @@ -14,8 +14,8 @@ func.func @non_replicated(%arg0: tensor<*x!tf_type.resource> {tf.device = "/devi mlir_module = "..."} : () -> (tensor, tensor<2x!tf_type.string>) tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf_type.string> }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf_type.string>) - // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} - // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 1 : i64, is_output = false} + // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 0 : i64, is_output = false}> + // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 1 : i64, is_output = false}> // CHECK: %[[ITER:.*]]:2 = "tf.IteratorGetNext" %2:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:CPU:0"} : (tensor<*x!tf_type.resource>) -> (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>) @@ -249,8 +249,8 @@ func.func @replicated(%arg0: tensor<*x!tf_type.resource> {tf.device = "/device:C mlir_module = "..."} : () -> (tensor, tensor<2x!tf_type.string>) tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf_type.string> }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf_type.string>) - // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} - // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 1 : i64, is_output = false} + // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 0 : i64, is_output = false}> + // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 1 : i64, is_output = false}> // CHECK: %[[ITER1:.*]]:2 = "tf.IteratorGetNext" %3:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:CPU:0"} : (tensor<*x!tf_type.resource>) -> (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>) @@ -294,8 +294,8 @@ func.func 
@replicated_packed(%arg0: tensor<*x!tf_type.resource> {tf.device = "/d mlir_module = "..."} : () -> (tensor, tensor<2x!tf_type.string>) tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf_type.string> }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf_type.string>) - // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} - // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 1 : i64, is_output = false} + // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 0 : i64, is_output = false}> + // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 1 : i64, is_output = false}> // CHECK-DAG: %[[COPY0:.*]] = "tf.TPUCopyWithLayout"(%[[ITER0]]#0, %[[LAYOUT0]]) {device = "/device:TPU:0"} // CHECK-DAG: %[[COPY1:.*]] = "tf.TPUCopyWithLayout"(%[[ITER0]]#1, %[[LAYOUT1]]) {device = "/device:TPU:0"} @@ -332,8 +332,8 @@ func.func @replicated(%arg0: tensor<*x!tf_type.resource> {tf.device = "/device:C mlir_module = "..."} : () -> (tensor, tensor<2x!tf_type.string>) tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf_type.string> }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf_type.string>) - // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} - // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 1 : i64, is_output = false} + // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 0 : i64, is_output = false}> + // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 1 : i64, is_output = false}> // CHECK: %[[ITER1:.*]] = "tf.IteratorGetNext" %3 = "tf.IteratorGetNext"(%arg1) {device = "/device:CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<3x3x1x32xf32> @@ -415,8 +415,8 @@ func.func @parallel_execute(%arg0: tensor<*x!tf_type.resource> {tf.device = "/de %1:3 = "tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\01 \02", mlir_module = "..."} : () -> (tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>) tf_device.return %1#0, %1#1, %1#2 : tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string> }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>) - // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} - // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#2) {index = 0 : i64, is_output = false} + // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 0 : i64, is_output = false}> + // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#2) <{index = 0 : i64, is_output = false}> // CHECK: %[[ITER:.*]]:2 = "tf.IteratorGetNext" %2:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:CPU:0"} : (tensor<*x!tf_type.resource>) -> (tensor<128xf32>, tensor<128xf32>) // CHECK: "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0) @@ -481,8 +481,8 @@ func.func @replicated_parallel_execute(%arg0: tensor<*x!tf_type.resource> {tf.de %1:3 = "tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\02 \02", mlir_module = "..."} : () -> (tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>) tf_device.return %1#0, %1#1, %1#2 : tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string> }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>) - // CHECK-DAG: %[[LAYOUT0:.*]] = 
"tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} - // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#2) {index = 0 : i64, is_output = false} + // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) <{index = 0 : i64, is_output = false}> + // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#2) <{index = 0 : i64, is_output = false}> // CHECK-DAG: %[[ITER0:.*]]:2 = "tf.IteratorGetNext"(%[[ARG0]]) // CHECK-DAG: %[[ITER1:.*]]:2 = "tf.IteratorGetNext"(%[[ARG1]]) %2:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:CPU:0"} : (tensor<*x!tf_type.resource>) -> (tensor<128xf32>, tensor<128xf32>) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-multiple-while-body-func.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-multiple-while-body-func.mlir index 1762e9f7889fd4..9746ca6919a8fe 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-multiple-while-body-func.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-multiple-while-body-func.mlir @@ -1,5 +1,5 @@ // RUN: tf-opt %s -tf-tpu-bridge 2>&1 | FileCheck %s -// RUN: tf-opt %s -tf-cluster-tpu-bridge-v1 2>&1 | FileCheck %s +// RUN: tf-opt %s -tf-cluster-tpu-bridge-v1 -tfrt-lower-cluster-to-runtime-ops-tpu -tf-dialect-to-executor-v1 2>&1 | FileCheck %s // This test verifies there is no warning about shape inference failure in TPU // bridge in handling multiple usage of the same function. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir index ed4375091167a4..1d3c1b6f3cf518 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir @@ -942,7 +942,7 @@ func.func @missing_metadata() { // CHECK-LABEL: func @const_with_attrs func.func @const_with_attrs(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor, tensor) { - // CHECK: %{{[a-z0-9_]*}} = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %{{[a-z0-9_]*}} = "tf.Const"() <{value = dense<-1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK-NEXT: %{{[a-z0-9_]*}} = "tf.Reshape"(%arg0 // CHECK-NEXT: %{{.*}} = "tf_device.cluster"() ({ %minus_one = "tf.Const"() {_replication_info = "cluster", diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir index 2125f2877fb8cf..4d290b71fca1c0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir @@ -15,7 +15,7 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0" %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor // CHECK: %[[INPUT:.*]] = "tf.IteratorGetNext" %1 = "tf.IteratorGetNext"(%arg5) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor<2x224x224x3xf32> - // CHECK-DAG: %[[SPACETODEPTH0:.*]] = "tf.SpaceToDepth"([[INPUT:.*]]) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> + // CHECK-DAG: %[[SPACETODEPTH0:.*]] = "tf.SpaceToDepth"([[INPUT:.*]]) <{block_size = 2 : i64, data_format = "NHWC"}> : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> %2 = "tf.AddV2"(%arg2, %arg3) {device = ""} : (tensor, tensor) -> tensor %3 = "tf.ReadVariableOp"(%arg6) : (tensor>>) -> tensor<7x7x3x64xf32> %4 = "tf.ReadVariableOp"(%arg8) : (tensor>>) -> 
tensor @@ -61,18 +61,18 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0" // CHECK-SAME: strides = [1, 1, 1, 1] // CHECK-SAME: (tensor<2x115x115x12xf32>, tensor<4xi32>, tensor<2x112x112x64xf32>) -> tensor<4x4x12x64xf32> %7 = "tf.Conv2DBackpropFilter"(%5, %2, %6) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<2x230x230x3xf32>, tensor<4xi32>, tensor<2x112x112x64xf32>) -> tensor<7x7x3x64xf32> - // CHECK: %[[CONST0:.*]] = "tf.Const"() {value = dense< + // CHECK: %[[CONST0:.*]] = "tf.Const"() <{value = dense< // CHECK-SAME: [4, 4, 2, 2, 3, 64] // CHECK: %[[RESHAPE0:.*]] = "tf.Reshape"(%[[BACKPROP:.*]], %[[CONST0:.*]]) : (tensor<4x4x12x64xf32>, tensor<6xi64>) -> tensor<4x4x2x2x3x64xf32> - // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense< + // CHECK: %[[CONST1:.*]] = "tf.Const"() <{value = dense< // CHECK-SAME: [0, 2, 1, 3, 4, 5] // CHECK: %[[TRANSPOSE0:.*]] = "tf.Transpose"(%[[RESHAPE0:.*]], %[[CONST1:.*]]) : (tensor<4x4x2x2x3x64xf32>, tensor<6xi32>) -> tensor<4x2x4x2x3x64xf32> - // CHECK: %[[CONST2:.*]] = "tf.Const"() {value = dense< + // CHECK: %[[CONST2:.*]] = "tf.Const"() <{value = dense< // CHECK-SAME: [8, 8, 3, 64] // CHECK: %[[RESHAPE1:.*]] = "tf.Reshape"(%[[TRANSPOSE1:.*]], %[[CONST2:.*]]) : (tensor<4x2x4x2x3x64xf32>, tensor<4xi64>) -> tensor<8x8x3x64xf32> - // CHECK: %[[CONST3:.*]] = "tf.Const"() {value = dense< + // CHECK: %[[CONST3:.*]] = "tf.Const"() <{value = dense< // CHECK-SAME: [7, 7, 3, 64] - // CHECK: %[[CONST4:.*]] = "tf.Const"() {value = dense< + // CHECK: %[[CONST4:.*]] = "tf.Const"() <{value = dense< // CHECK-SAME: 0 // CHECK: %[[SLICE0:.*]] = "tf.Slice"(%[[RESHAPE1:.*]], %[[CONST4:.*]], %[[CONST3:.*]]) : (tensor<8x8x3x64xf32>, tensor<4xi64>, tensor<4xi32>) -> tensor<7x7x3x64xf32> %8 = "tf.CrossReplicaSum"(%7, %1) : (tensor<7x7x3x64xf32>, tensor<1x1xi32>) -> tensor<7x7x3x64xf32> @@ -90,10 +90,10 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0" module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:COMPOSITE:0" = {}, "/job:localhost/replica:0/task:0/device:CPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:1" = {}, "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0" = {}}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 458 : i32}} { func.func @main(%arg0: tensor<*x!tf_type.resource> {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg1: tensor {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg2: tensor<*x!tf_type.resource> {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg3: tensor {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg4: tensor<*x!tf_type.resource> {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg5: tensor {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg6: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg7: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = 
"/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg8: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg9: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg10: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg11: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg12: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}) attributes {tf.entry_function = {control_outputs = "IteratorGetNext,IteratorGetNext_1,CrossReplicaSum,AssignAddVariableOp,CrossReplicaSum_1,AssignAddVariableOp_1,CrossReplicaSum_2,AssignAddVariableOp_2,CrossReplicaSum_3,AssignAddVariableOp_3", inputs = "iterator,iterator_1,iterator_2,iterator_3,iterator_4,iterator_5,resnet50_conv1_conv2d_conv1_kernel_140365606309224_handle_inputs_0,resnet50_fc1000_matmul_fc1000_kernel_140365944145960_handle_inputs_0,resnet50_fc1000_biasadd_fc1000_bias_140365944146240_handle_inputs_0,total_140366323758976_handle_inputs_0,count_140366323759312_handle_inputs_0,total_140366323760264_handle_inputs_0,count_140366323760600_handle_inputs_0", outputs = ""}} { // CHECK: %[[INPUT00:.*]] = "tf.IteratorGetNext" - // CHECK-DAG: %[[SPACETODEPTH00:.*]] = "tf.SpaceToDepth"([[INPUT00:.*]]#0) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> + // CHECK-DAG: %[[SPACETODEPTH00:.*]] = "tf.SpaceToDepth"([[INPUT00:.*]]#0) <{block_size = 2 : i64, data_format = "NHWC"}> : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> %0:2 = "tf.IteratorGetNext"(%arg2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*x!tf_type.resource>) -> (tensor<2x224x224x3xf32>, tensor<2x1xf32>) // CHECK: %[[INPUT01:.*]] = "tf.IteratorGetNext" - // CHECK-DAG: %[[SPACETODEPTH01:.*]] = "tf.SpaceToDepth"([[INPUT01:.*]]#0) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> + // CHECK-DAG: %[[SPACETODEPTH01:.*]] = "tf.SpaceToDepth"([[INPUT01:.*]]#0) <{block_size = 2 : i64, data_format = "NHWC"}> : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> %1:2 = "tf.IteratorGetNext"(%arg4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*x!tf_type.resource>) -> (tensor<2x224x224x3xf32>, tensor<2x1xf32>) tf_device.replicate([%0#0, %1#0] as %arg13: tensor<2x224x224x3xf32>, [%0#1, %1#1] as %arg14: tensor<2x1xf32>, %arg6 as %arg15: tensor<*x!tf_type.resource>>, %arg8 as %arg16: tensor<*x!tf_type.resource>>, %arg7 as %arg17: tensor<*x!tf_type.resource>>, %arg9 as %arg18: tensor<*x!tf_type.resource>>, %arg10 as %arg19: tensor<*x!tf_type.resource>>, %arg11 as %arg20: tensor<*x!tf_type.resource>>, %arg12 as %arg21: tensor<*x!tf_type.resource>>) {_mirrored_variable_indices = [2, 3, 4, 5, 6, 7, 8], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { %2 = "tf.ReadVariableOp"(%arg15) : (tensor<*x!tf_type.resource>>) -> tensor<7x7x3x64xf32> @@ -167,7 +167,7 @@ module attributes {tf.devices = 
{"/job:localhost/replica:0/task:0/device:COMPOSI module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:COMPOSITE:0" = {}, "/job:localhost/replica:0/task:0/device:CPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:1" = {}, "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0" = {}}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 458 : i32}} { func.func @main(%arg0: tensor<*x!tf_type.resource> {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg1: tensor {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg2: tensor<*x!tf_type.resource> {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg3: tensor {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg4: tensor<*x!tf_type.resource> {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg5: tensor {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg6: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg7: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg8: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg9: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg10: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg11: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg12: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}) attributes {tf.entry_function = {control_outputs = "IteratorGetNext,IteratorGetNext_1,CrossReplicaSum,AssignAddVariableOp,CrossReplicaSum_1,AssignAddVariableOp_1,CrossReplicaSum_2,AssignAddVariableOp_2,CrossReplicaSum_3,AssignAddVariableOp_3", inputs = "iterator,iterator_1,iterator_2,iterator_3,iterator_4,iterator_5,resnet50_conv1_conv2d_conv1_kernel_140365606309224_handle_inputs_0,resnet50_fc1000_matmul_fc1000_kernel_140365944145960_handle_inputs_0,resnet50_fc1000_biasadd_fc1000_bias_140365944146240_handle_inputs_0,total_140366323758976_handle_inputs_0,count_140366323759312_handle_inputs_0,total_140366323760264_handle_inputs_0,count_140366323760600_handle_inputs_0", outputs = ""}} { // CHECK: %[[INPUT00:.*]] = "tf.IteratorGetNext" - // CHECK-DAG: %[[SPACETODEPTH00:.*]] = "tf.SpaceToDepth"([[INPUT00:.*]]#0) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> + // CHECK-DAG: %[[SPACETODEPTH00:.*]] = "tf.SpaceToDepth"([[INPUT00:.*]]#0) <{block_size = 2 : i64, data_format = "NHWC"}> : (tensor<2x224x224x3xf32>) -> 
tensor<2x112x112x12xf32> %0:2 = "tf.IteratorGetNext"(%arg2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*x!tf_type.resource>) -> (tensor<2x224x224x3xf32>, tensor<2x1xf32>) tf_device.replicate(%0#0 as %arg13: tensor<2x224x224x3xf32>, %0#1 as %arg14: tensor<2x1xf32>, %arg6 as %arg15: tensor<*x!tf_type.resource>>, %arg8 as %arg16: tensor<*x!tf_type.resource>>, %arg7 as %arg17: tensor<*x!tf_type.resource>>, %arg9 as %arg18: tensor<*x!tf_type.resource>>, %arg10 as %arg19: tensor<*x!tf_type.resource>>, %arg11 as %arg20: tensor<*x!tf_type.resource>>, %arg12 as %arg21: tensor<*x!tf_type.resource>>) {_mirrored_variable_indices = [2, 3, 4, 5, 6, 7, 8], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { %2 = "tf.ReadVariableOp"(%arg15) : (tensor<*x!tf_type.resource>>) -> tensor<7x7x3x64xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_update_embedding_enqueue_op_inputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_update_embedding_enqueue_op_inputs.mlir index d3276c28394408..d1437a6e5a20ad 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_update_embedding_enqueue_op_inputs.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_update_embedding_enqueue_op_inputs.mlir @@ -15,7 +15,7 @@ func.func @check_enqueue_ops_update_for_eval(%arg0: tensor, %arg1: tens // CHECK: %[[CONST_0:.*]] = "tf.Const"() %0 = "tf.Const"() {value = dense<[]> : tensor<0xf32>} : () -> tensor<0xf32> - // CHECK: %[[CONST_MODE:.*]] = "tf.Const"() {_xla_outside_compilation = "0", value = dense<"inference"> : tensor} : () -> tensor + // CHECK: %[[CONST_MODE:.*]] = "tf.Const"() <{value = dense<"inference"> : tensor}> {_xla_outside_compilation = "0"} : () -> tensor // CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%[[ARG_0]], %[[ARG_1]], %[[ARG_2]], %[[ARG_3]], %[[ARG_4]], %[[ARG_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]], %[[CONST_MODE]]) "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %0, %0, %0, %arg7) {_tpu_embedding_layer = "call1", _xla_outside_compilation = "0", combiners = ["mean", "sum"], device_ordinal = -1 : i64, max_sequence_lengths = [0, 0, 0], table_ids = [1, 1, 0]} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor<0xf32>, tensor<0xf32>, tensor<0xf32>, tensor) -> () %2:2 = "tf.RecvTPUEmbeddingActivations"() {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D"} : () -> (tensor<2x2xf32>, tensor<4x4xf32>) @@ -43,7 +43,7 @@ func.func @check_enqueue_ops_update_for_training(%arg0: tensor, %arg1: %3 = "tf.Const"() {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> "tf.SendTPUEmbeddingGradients"(%2, %3) {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D", operandSegmentSizes = array} : (tensor<2x2xf32>, tensor<4x4xf32>) -> () - // CHECK: %[[CONST_MODE:.*]] = "tf.Const"() {_xla_outside_compilation = "0", value = dense<"train"> : tensor} : () -> tensor + // CHECK: %[[CONST_MODE:.*]] = "tf.Const"() <{value = dense<"train"> : tensor}> {_xla_outside_compilation = "0"} : () -> tensor // CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%[[ARG_0]], %[[ARG_1]], %[[ARG_2]], %[[ARG_3]], %[[ARG_4]], %[[ARG_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]], %[[CONST_MODE]]) "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %0, %0, %0, %arg7) {_tpu_embedding_layer = "call1", _xla_outside_compilation = "0", combiners = ["mean", "sum"], device_ordinal = -1 : i64, max_sequence_lengths = [0, 0, 0], table_ids = [1, 1, 0]} : (tensor, tensor, tensor, tensor, 
tensor, tensor, tensor<0xf32>, tensor<0xf32>, tensor<0xf32>, tensor) -> () %4:2 = "tf.RecvTPUEmbeddingActivations"() {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D"} : () -> (tensor<2x2xf32>, tensor<4x4xf32>) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir b/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir index ec2d36d126775b..4333e79e0ee430 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir @@ -7,12 +7,12 @@ func.func @batchMatMulTwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6xf func.return %0 : tensor<2x3x4x6xf32> // CHECK-LABEL: batchMatMulTwoDim - // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 4, 5]> : tensor<3xi64>} - // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 5, 6]> : tensor<3xi64>} - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} - // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 3, 4, 6]> : tensor<4xi64>} + // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 4, 5]> : tensor<3xi64>}> + // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 5, 6]> : tensor<3xi64>}> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> + // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 3, 4, 6]> : tensor<4xi64>}> // CHECK: %[[LHS_RESHAPED:.*]] = "tf.Reshape"(%arg0, %[[LHS_RESHAPED_SHAPE]]) : (tensor<2x3x4x5xf32>, tensor<3xi64>) -> tensor<6x4x5xf32> // CHECK: %[[LHS_SPLIT:.*]]:6 = "tf.Split"(%[[SPLITTING_AXIS]], %[[LHS_RESHAPED]]) : (tensor, tensor<6x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>) @@ -32,14 +32,14 @@ func.func @batchMatMulTwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6xf // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) {transpose_a = false, 
transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> // CHECK: return %[[RESULT]] : tensor<2x3x4x6xf32> } @@ -51,12 +51,12 @@ func.func @batchMatMulTwoDimAdjXY(%arg0: tensor<2x3x5x4xf32>, %arg1: tensor<2x3x func.return %0 : tensor<2x3x4x6xf32> // CHECK-LABEL: batchMatMulTwoDimAdjXY - // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 5, 4]> : tensor<3xi64>} - // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 6, 5]> : tensor<3xi64>} - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 4]> : tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 5]> : tensor<2xi64>} - // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 3, 4, 6]> : tensor<4xi64>} + // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 5, 4]> : tensor<3xi64>}> + // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 6, 5]> : tensor<3xi64>}> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 4]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 3, 4, 6]> : tensor<4xi64>}> // CHECK: %[[LHS_RESHAPED:.*]] = "tf.Reshape"(%arg0, %[[LHS_RESHAPED_SHAPE]]) : (tensor<2x3x5x4xf32>, tensor<3xi64>) -> tensor<6x5x4xf32> // CHECK: %[[LHS_SPLIT:.*]]:6 = "tf.Split"(%[[SPLITTING_AXIS]], %[[LHS_RESHAPED]]) : (tensor, tensor<6x5x4xf32>) -> (tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>) @@ -76,14 
+76,14 @@ func.func @batchMatMulTwoDimAdjXY(%arg0: tensor<2x3x5x4xf32>, %arg1: tensor<2x3x // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> // CHECK: return %[[RESULT]] : tensor<2x3x4x6xf32> } @@ -95,9 +95,9 @@ func.func @batchMatMulOneDim(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulOneDim - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} + // CHECK-DAG: 
%[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> // CHECK: %[[LHS_RESHAPED:.*]]:3 = "tf.Split"(%[[SPLITTING_AXIS]], %arg0) : (tensor, tensor<3x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>) // CHECK: %[[LHS_1:.*]] = "tf.Reshape"(%[[LHS_RESHAPED]]#0, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> @@ -109,11 +109,11 @@ func.func @batchMatMulOneDim(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> } @@ -124,16 +124,16 @@ func.func @batchMatMulSingleBatch(%arg0: tensor<1x4x5xf32>, %arg1: tensor<1x5x6x func.return %0 : tensor<1x4x6xf32> // CHECK-LABEL: batchMatMulSingleBatch - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} : () -> tensor<2xi64> - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} : () -> tensor<2xi64> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> : () -> tensor<2xi64> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[LHS_1:.*]] = "tf.Reshape"(%arg0, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%arg1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, transpose_b = 
false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]]) {axis = 0 : i64} : (tensor<4x6xf32>) -> tensor<1x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]]) <{axis = 0 : i64}> : (tensor<4x6xf32>) -> tensor<1x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<1x4x6xf32> } @@ -144,19 +144,19 @@ func.func @batchMatMulUnbatchedLeft(%arg0: tensor<4x5xf32>, %arg1: tensor<3x5x6x func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulUnbatchedLeft - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> // CHECK: %[[RHS_RESHAPED:.*]]:3 = "tf.Split"(%[[SPLITTING_AXIS]], %arg1) : (tensor, tensor<3x5x6xf32>) -> (tensor<1x5x6xf32>, tensor<1x5x6xf32>, tensor<1x5x6xf32>) // CHECK: %[[RHS_1:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#0, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> } @@ -167,19 +167,19 @@ func.func @batchMatMulUnbatchedRight(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5x6 func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulUnbatchedRight - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} : () -> tensor<2xi64> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[LHS_SPLIT:.*]]:3 = 
"tf.Split"(%[[SPLITTING_AXIS]], %arg0) : (tensor, tensor<3x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>) // CHECK: %[[LHS_1:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#0, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[LHS_2:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#1, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[LHS_3:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#2, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> } @@ -190,7 +190,7 @@ func.func @batchMatMulMatrix(%arg0: tensor<4x5xf32>, %arg1: tensor<5x6xf32>) -> func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulMatrix - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -201,7 +201,7 @@ func.func @batchMatMulMatrixAdjXY(%arg0: tensor<5x4xf32>, %arg1: tensor<6x5xf32> func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulMatrixAdjXY - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -213,12 +213,12 @@ func.func @batchMatMulV2TwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6 func.return %0 : tensor<2x3x4x6xf32> // CHECK-LABEL: batchMatMulV2TwoDim - // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 4, 5]> : tensor<3xi64>} - // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 5, 6]> : tensor<3xi64>} - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : 
tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} - // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 3, 4, 6]> : tensor<4xi64>} + // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 4, 5]> : tensor<3xi64>}> + // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 5, 6]> : tensor<3xi64>}> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> + // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 3, 4, 6]> : tensor<4xi64>}> // CHECK: %[[LHS_RESHAPED:.*]] = "tf.Reshape"(%arg0, %[[LHS_RESHAPED_SHAPE]]) : (tensor<2x3x4x5xf32>, tensor<3xi64>) -> tensor<6x4x5xf32> // CHECK: %[[LHS_SPLIT:.*]]:6 = "tf.Split"(%[[SPLITTING_AXIS]], %[[LHS_RESHAPED]]) : (tensor, tensor<6x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>) @@ -238,14 +238,14 @@ func.func @batchMatMulV2TwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6 // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: 
%[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> // CHECK: return %[[RESULT]] : tensor<2x3x4x6xf32> } @@ -257,12 +257,12 @@ func.func @batchMatMulV2TwoDimAdjXY(%arg0: tensor<2x3x5x4xf32>, %arg1: tensor<2x func.return %0 : tensor<2x3x4x6xf32> // CHECK-LABEL: batchMatMulV2TwoDimAdjXY - // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 5, 4]> : tensor<3xi64>} - // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 6, 5]> : tensor<3xi64>} - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 4]> : tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 5]> : tensor<2xi64>} - // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 3, 4, 6]> : tensor<4xi64>} + // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 5, 4]> : tensor<3xi64>}> + // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 6, 5]> : tensor<3xi64>}> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 4]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 3, 4, 6]> : tensor<4xi64>}> // CHECK: %[[LHS_RESHAPED:.*]] = "tf.Reshape"(%arg0, %[[LHS_RESHAPED_SHAPE]]) : (tensor<2x3x5x4xf32>, tensor<3xi64>) -> tensor<6x5x4xf32> // CHECK: %[[LHS_SPLIT:.*]]:6 = "tf.Split"(%[[SPLITTING_AXIS]], %[[LHS_RESHAPED]]) : (tensor, tensor<6x5x4xf32>) -> (tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>) @@ -282,14 +282,14 @@ func.func @batchMatMulV2TwoDimAdjXY(%arg0: tensor<2x3x5x4xf32>, %arg1: tensor<2x // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) 
{transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> // CHECK: return %[[RESULT]] : tensor<2x3x4x6xf32> } @@ -301,12 +301,12 @@ func.func @batchMatMulV2Broadcast(%arg0: tensor<2x1x4x5xf32>, %arg1: tensor<1x3x func.return %0 : tensor<2x3x4x6xf32> // CHECK-LABEL: batchMatMulV2Broadcast - // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 4, 5]> : tensor<3xi64>} - // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[3, 5, 6]> : tensor<3xi64>} - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} - // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 3, 4, 6]> : tensor<4xi64>} + // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 4, 5]> : tensor<3xi64>}> + // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[3, 5, 6]> : tensor<3xi64>}> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> + // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 3, 4, 6]> : tensor<4xi64>}> // CHECK: %[[LHS_RESHAPED:.*]] = "tf.Reshape"(%arg0, %[[LHS_RESHAPED_SHAPE]]) : (tensor<2x1x4x5xf32>, tensor<3xi64>) -> tensor<2x4x5xf32> // CHECK: %[[LHS_SPLIT:.*]]:2 = "tf.Split"(%[[SPLITTING_AXIS]], 
%[[LHS_RESHAPED]]) : (tensor, tensor<2x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>) @@ -319,14 +319,14 @@ func.func @batchMatMulV2Broadcast(%arg0: tensor<2x1x4x5xf32>, %arg1: tensor<1x3x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> // CHECK: return %[[RESULT]] : tensor<2x3x4x6xf32> } @@ -338,9 +338,9 @@ func.func @batchMatMulV2OneDim(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32 func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulV2OneDim - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> 
: tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> // CHECK: %[[LHS_RESHAPED:.*]]:3 = "tf.Split"(%[[SPLITTING_AXIS]], %arg0) : (tensor, tensor<3x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>) // CHECK: %[[LHS_1:.*]] = "tf.Reshape"(%[[LHS_RESHAPED]]#0, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> @@ -352,11 +352,11 @@ func.func @batchMatMulV2OneDim(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32 // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> } @@ -367,16 +367,16 @@ func.func @batchMatMulV2SingleBatch(%arg0: tensor<1x4x5xf32>, %arg1: tensor<1x5x func.return %0 : tensor<1x4x6xf32> // CHECK-LABEL: batchMatMulV2SingleBatch - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} : () -> tensor<2xi64> - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} : () -> tensor<2xi64> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> : () -> tensor<2xi64> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[LHS_1:.*]] = "tf.Reshape"(%arg0, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%arg1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, 
tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]]) {axis = 0 : i64} : (tensor<4x6xf32>) -> tensor<1x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]]) <{axis = 0 : i64}> : (tensor<4x6xf32>) -> tensor<1x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<1x4x6xf32> } @@ -387,19 +387,19 @@ func.func @batchMatMulV2UnbatchedLeft(%arg0: tensor<4x5xf32>, %arg1: tensor<3x5x func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulV2UnbatchedLeft - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> // CHECK: %[[RHS_RESHAPED:.*]]:3 = "tf.Split"(%[[SPLITTING_AXIS]], %arg1) : (tensor, tensor<3x5x6xf32>) -> (tensor<1x5x6xf32>, tensor<1x5x6xf32>, tensor<1x5x6xf32>) // CHECK: %[[RHS_1:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#0, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> } @@ -410,19 +410,19 @@ func.func @batchMatMulV2UnbatchedRight(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5 func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulV2UnbatchedRight - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} : () -> tensor<2xi64> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK-DAG: 
%[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[LHS_SPLIT:.*]]:3 = "tf.Split"(%[[SPLITTING_AXIS]], %arg0) : (tensor, tensor<3x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>) // CHECK: %[[LHS_1:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#0, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[LHS_2:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#1, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[LHS_3:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#2, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> } @@ -433,7 +433,7 @@ func.func @batchMatMulV2Matrix(%arg0: tensor<4x5xf32>, %arg1: tensor<5x6xf32>) - func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulV2Matrix - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -444,7 +444,7 @@ func.func @batchMatMulV2MatrixAdjXY(%arg0: tensor<5x4xf32>, %arg1: tensor<6x5xf3 func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulV2MatrixAdjXY - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -455,7 +455,7 @@ func.func @batchMatMulV2DynamicSize(%arg0: tensor, %arg1: tensor // CHECK-LABEL: batchMatMulV2DynamicSize - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : (tensor, tensor) -> tensor + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> : (tensor, tensor) -> tensor // CHECK: return 
%[[MATMUL_1]] : tensor } @@ -467,12 +467,12 @@ func.func @batchMatMulV3TwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6 func.return %0 : tensor<2x3x4x6xf32> // CHECK-LABEL: batchMatMulV3TwoDim - // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 4, 5]> : tensor<3xi64>} - // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 5, 6]> : tensor<3xi64>} - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} - // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 3, 4, 6]> : tensor<4xi64>} + // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 4, 5]> : tensor<3xi64>}> + // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 5, 6]> : tensor<3xi64>}> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> + // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 3, 4, 6]> : tensor<4xi64>}> // CHECK: %[[LHS_RESHAPED:.*]] = "tf.Reshape"(%arg0, %[[LHS_RESHAPED_SHAPE]]) : (tensor<2x3x4x5xf32>, tensor<3xi64>) -> tensor<6x4x5xf32> // CHECK: %[[LHS_SPLIT:.*]]:6 = "tf.Split"(%[[SPLITTING_AXIS]], %[[LHS_RESHAPED]]) : (tensor, tensor<6x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>) @@ -492,14 +492,14 @@ func.func @batchMatMulV3TwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6 // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : 
(tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> // CHECK: return %[[RESULT]] : tensor<2x3x4x6xf32> } @@ -511,12 +511,12 @@ func.func @batchMatMulV3TwoDimAdjXY(%arg0: tensor<2x3x5x4xf32>, %arg1: tensor<2x func.return %0 : tensor<2x3x4x6xf32> // CHECK-LABEL: batchMatMulV3TwoDimAdjXY - // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 5, 4]> : tensor<3xi64>} - // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 6, 5]> : tensor<3xi64>} - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 4]> : tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[6, 5]> : tensor<2xi64>} - // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 3, 4, 6]> : tensor<4xi64>} + // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 5, 4]> : tensor<3xi64>}> + // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 6, 5]> : tensor<3xi64>}> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 4]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[6, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 3, 4, 6]> : tensor<4xi64>}> // CHECK: %[[LHS_RESHAPED:.*]] = "tf.Reshape"(%arg0, %[[LHS_RESHAPED_SHAPE]]) : (tensor<2x3x5x4xf32>, tensor<3xi64>) -> tensor<6x5x4xf32> // CHECK: %[[LHS_SPLIT:.*]]:6 = "tf.Split"(%[[SPLITTING_AXIS]], %[[LHS_RESHAPED]]) : (tensor, tensor<6x5x4xf32>) -> (tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>, tensor<1x5x4xf32>) @@ -536,14 +536,14 @@ func.func @batchMatMulV3TwoDimAdjXY(%arg0: tensor<2x3x5x4xf32>, %arg1: tensor<2x // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> 
tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> // CHECK: return %[[RESULT]] : tensor<2x3x4x6xf32> } @@ -555,12 +555,12 @@ func.func @batchMatMulV3Broadcast(%arg0: tensor<2x1x4x5xf32>, %arg1: tensor<1x3x func.return %0 : tensor<2x3x4x6xf32> // CHECK-LABEL: batchMatMulV3Broadcast - // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 4, 5]> : tensor<3xi64>} - // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() {value = dense<[3, 5, 6]> : tensor<3xi64>} - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} - // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() {value = dense<[2, 3, 4, 6]> : tensor<4xi64>} + // CHECK-DAG: %[[LHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 4, 5]> : tensor<3xi64>}> + // CHECK-DAG: %[[RHS_RESHAPED_SHAPE:.*]] = "tf.Const"() <{value = dense<[3, 5, 6]> : tensor<3xi64>}> + // 
CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> + // CHECK-DAG: %[[RESULT_SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 3, 4, 6]> : tensor<4xi64>}> // CHECK: %[[LHS_RESHAPED:.*]] = "tf.Reshape"(%arg0, %[[LHS_RESHAPED_SHAPE]]) : (tensor<2x1x4x5xf32>, tensor<3xi64>) -> tensor<2x4x5xf32> // CHECK: %[[LHS_SPLIT:.*]]:2 = "tf.Split"(%[[SPLITTING_AXIS]], %[[LHS_RESHAPED]]) : (tensor, tensor<2x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>) @@ -573,14 +573,14 @@ func.func @batchMatMulV3Broadcast(%arg0: tensor<2x1x4x5xf32>, %arg1: tensor<1x3x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, 
tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> // CHECK: return %[[RESULT]] : tensor<2x3x4x6xf32> } @@ -592,9 +592,9 @@ func.func @batchMatMulV3OneDim(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32 func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulV3OneDim - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> // CHECK: %[[LHS_RESHAPED:.*]]:3 = "tf.Split"(%[[SPLITTING_AXIS]], %arg0) : (tensor, tensor<3x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>) // CHECK: %[[LHS_1:.*]] = "tf.Reshape"(%[[LHS_RESHAPED]]#0, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> @@ -606,11 +606,11 @@ func.func @batchMatMulV3OneDim(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32 // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> } @@ -621,16 +621,16 @@ func.func @batchMatMulV3SingleBatch(%arg0: tensor<1x4x5xf32>, %arg1: tensor<1x5x func.return %0 : tensor<1x4x6xf32> // CHECK-LABEL: batchMatMulV3SingleBatch - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} : () -> tensor<2xi64> - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} : () -> tensor<2xi64> + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = 
"tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> : () -> tensor<2xi64> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[LHS_1:.*]] = "tf.Reshape"(%arg0, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%arg1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]]) {axis = 0 : i64} : (tensor<4x6xf32>) -> tensor<1x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]]) <{axis = 0 : i64}> : (tensor<4x6xf32>) -> tensor<1x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<1x4x6xf32> } @@ -641,19 +641,19 @@ func.func @batchMatMulV3UnbatchedLeft(%arg0: tensor<4x5xf32>, %arg1: tensor<3x5x func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulV3UnbatchedLeft - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} - // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() {value = dense<[5, 6]> : tensor<2xi64>} + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> + // CHECK-DAG: %[[MATMUL_RHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[5, 6]> : tensor<2xi64>}> // CHECK: %[[RHS_RESHAPED:.*]]:3 = "tf.Split"(%[[SPLITTING_AXIS]], %arg1) : (tensor, tensor<3x5x6xf32>) -> (tensor<1x5x6xf32>, tensor<1x5x6xf32>, tensor<1x5x6xf32>) // CHECK: %[[RHS_1:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#0, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: 
return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> } @@ -664,19 +664,19 @@ func.func @batchMatMulV3UnbatchedRight(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5 func.return %0 : tensor<3x4x6xf32> // CHECK-LABEL: batchMatMulV3UnbatchedRight - // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi64>} : () -> tensor<2xi64> + // CHECK-DAG: %[[SPLITTING_AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + // CHECK-DAG: %[[MATMUL_LHS_SHAPE:.*]] = "tf.Const"() <{value = dense<[4, 5]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK: %[[LHS_SPLIT:.*]]:3 = "tf.Split"(%[[SPLITTING_AXIS]], %arg0) : (tensor, tensor<3x4x5xf32>) -> (tensor<1x4x5xf32>, tensor<1x4x5xf32>, tensor<1x4x5xf32>) // CHECK: %[[LHS_1:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#0, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[LHS_2:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#1, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[LHS_3:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#2, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) {axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> } @@ -687,7 +687,7 @@ func.func @batchMatMulV3Matrix(%arg0: tensor<4x5xf32>, %arg1: tensor<5x6xf32>) - func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulV3Matrix - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -698,7 +698,7 @@ func.func @batchMatMulV3MatrixAdjXY(%arg0: tensor<5x4xf32>, %arg1: tensor<6x5xf3 func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulV3MatrixAdjXY - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) {transpose_a = true, transpose_b = true} : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) 
<{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/update_control_dependencies.mlir b/tensorflow/compiler/mlir/tensorflow/tests/update_control_dependencies.mlir index 263a6762238823..09931d9fc2adb6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/update_control_dependencies.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/update_control_dependencies.mlir @@ -80,7 +80,7 @@ func.func @incorrect_control_deps_replaced_with_correct_controls(%arg0: tensor<* // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg1) // CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1]], %arg1) -// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2]]) {message = "add2 result"} +// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2]]) <{message = "add2 result"}> // CHECK: tf_executor.fetch %[[ADD1]], %[[ADD2]], %[[PRINT_control]] : // CHECK: } // CHECK: return %[[GRAPH]]#0, %[[GRAPH]]#1 @@ -99,7 +99,7 @@ func.func @trailing_print(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<* // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg1) // CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1]], %arg1) -// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2]]) {message = "add2 result"} +// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2]]) <{message = "add2 result"}> // CHECK: tf_executor.fetch %[[ADD1]], %[[ADD2]], %[[PRINT_control]] : // CHECK: } // CHECK: return %[[GRAPH]]#0, %[[GRAPH]]#1 @@ -127,7 +127,7 @@ func.func @non_aliasing_reads_writes( // CHECK-DAG: %[[READ0:.*]], %[[READ0_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg0) // CHECK-DAG: %[[ASSIGN0_CONTROL:.*]] = tf_executor.island(%[[READ0_CONTROL]]) wraps "tf.AssignVariableOp"(%arg0, %arg2) // CHECK-DAG: %[[READ1:.*]], %[[READ1_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg1) -// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v0"} +// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v0"}> // CHECK: %[[READ2:.*]], %[[READ2_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[VH0]]) // CHECK: %[[ASSIGN1_CONTROL:.*]] = tf_executor.island(%[[READ1_CONTROL]]) wraps "tf.AssignVariableOp"(%arg1, %[[READ0]]) // CHECK: %[[ASSIGN2_CONTROL:.*]] = tf_executor.island(%[[ASSIGN0_CONTROL]]) wraps "tf.AssignVariableOp"(%arg0, %[[READ2]]) @@ -151,8 +151,8 @@ func.func @unknown_side_effecting_op(%arg0: tensor<32xf32>) { } // CHECK-LABEL: func @unknown_side_effecting_op // CHECK: tf_executor.graph { -// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v0"} -// CHECK: %[[VH1:.*]], %[[VH1_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v1"} +// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v0"}> +// CHECK: %[[VH1:.*]], %[[VH1_CONTROL:.*]] = 
tf_executor.island wraps "tf.VarHandleOp"() <{container = "c", shared_name = "v1"}> // CHECK: %[[READ0:.*]], %[[READ0_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[VH0]]) // CHECK: %[[ASSIGN0_CONTROL:.*]] = tf_executor.island wraps "tf.AssignVariableOp"(%[[VH1]], %arg0) // CHECK: %[[UNKNOWN_CONTROL:.*]] = tf_executor.island(%[[READ0_CONTROL]], %[[ASSIGN0_CONTROL]]) wraps "tf._UnknownSideEffectingOp_"() @@ -170,7 +170,7 @@ func.func @single_op_island_forward_block_arg(%arg0: tensor) -> (t } // CHECK-LABEL: func @single_op_island_forward_block_arg // CHECK: tf_executor.graph { -// CHECK: %[[outputs:.*]], %[[control:.*]] = tf_executor.island wraps "tf.Const"() {value = dense<0.000000e+00> : tensor<2048xf32>} : () -> tensor<2048xf32> +// CHECK: %[[outputs:.*]], %[[control:.*]] = tf_executor.island wraps "tf.Const"() <{value = dense<0.000000e+00> : tensor<2048xf32>}> : () -> tensor<2048xf32> // CHECK: tf_executor.fetch %[[outputs]], %arg0 : tensor<2048xf32>, tensor func.func @tpu_load_embedding_ops_sink_controls(%arg0: tensor<*x!tf_type.resource>>, %arg1: tensor<*x!tf_type.resource>>, %arg2: tensor<*x!tf_type.resource>>, %arg3: tensor<*x!tf_type.resource>>) { @@ -194,13 +194,13 @@ func.func @tpu_load_embedding_ops_sink_controls(%arg0: tensor<*x!tf_type.resourc // CHECK: %[[outputs:.*]], %[[control:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor<*x!tf_type.resource>>) -> tensor<8xf32> // CHECK: %[[outputs_0:.*]], %[[control_1:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg1) {device = ""} : (tensor<*x!tf_type.resource>>) -> tensor<8xf32> // CHECK: %[[outputs_2:.*]], %[[control_3:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg2) {device = ""} : (tensor<*x!tf_type.resource>>) -> tensor<8xf32> -// CHECK: %[[control_4:.*]] = tf_executor.island wraps "tf.LoadTPUEmbeddingAdagradParameters"(%[[outputs]], %[[outputs_0]]) {config = "", num_shards = 1 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "table1"} : (tensor<8xf32>, tensor<8xf32>) -> () +// CHECK: %[[control_4:.*]] = tf_executor.island wraps "tf.LoadTPUEmbeddingAdagradParameters"(%[[outputs]], %[[outputs_0]]) <{config = "", num_shards = 1 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "table1"}> : (tensor<8xf32>, tensor<8xf32>) -> () // CHECK: %[[outputs_5:.*]], %[[control_6:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg3) {device = ""} : (tensor<*x!tf_type.resource>>) -> tensor<8xf32> -// CHECK: %[[control_7:.*]] = tf_executor.island wraps "tf.LoadTPUEmbeddingAdagradParameters"(%[[outputs_2]], %[[outputs_5]]) {config = "", num_shards = 1 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "table2"} : (tensor<8xf32>, tensor<8xf32>) -> () +// CHECK: %[[control_7:.*]] = tf_executor.island wraps "tf.LoadTPUEmbeddingAdagradParameters"(%[[outputs_2]], %[[outputs_5]]) <{config = "", num_shards = 1 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "table2"}> : (tensor<8xf32>, tensor<8xf32>) -> () // CHECK: %[[control_8:.*]] = tf_executor.island(%[[control]], %[[control_1]], %[[control_3]], %[[control_4]], %[[control_6]], %[[control_7]]) wraps "tf.UnknownOp"() : () -> () // CHECK: %[[control_9:.*]] = tf_executor.island(%[[control_8]]) wraps "tf.UnknownOp"() : () -> () -// CHECK: %[[control_10:.*]] = tf_executor.island(%[[control_9]]) wraps "tf.LoadTPUEmbeddingAdagradParameters"(%[[outputs]], %[[outputs_0]]) {config = "", num_shards = 1 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "table3"} : (tensor<8xf32>, 
tensor<8xf32>) -> () -// CHECK: %[[control_11:.*]] = tf_executor.island(%[[control_9]]) wraps "tf.LoadTPUEmbeddingAdagradParameters"(%[[outputs_2]], %[[outputs_5]]) {config = "", num_shards = 1 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "table4"} : (tensor<8xf32>, tensor<8xf32>) -> () +// CHECK: %[[control_10:.*]] = tf_executor.island(%[[control_9]]) wraps "tf.LoadTPUEmbeddingAdagradParameters"(%[[outputs]], %[[outputs_0]]) <{config = "", num_shards = 1 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "table3"}> : (tensor<8xf32>, tensor<8xf32>) -> () +// CHECK: %[[control_11:.*]] = tf_executor.island(%[[control_9]]) wraps "tf.LoadTPUEmbeddingAdagradParameters"(%[[outputs_2]], %[[outputs_5]]) <{config = "", num_shards = 1 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "table4"}> : (tensor<8xf32>, tensor<8xf32>) -> () // CHECK: tf_executor.fetch %[[control_10]], %[[control_11]] : !tf_executor.control, !tf_executor.control // ----- diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_deserialization.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_deserialization.mlir index 982b44fd86ba9f..82f999c46261a1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_deserialization.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_deserialization.mlir @@ -18,9 +18,9 @@ module { // CHECK-SAME: %[[ARG0:.*]]: tensor<10xi32>, %[[ARG1:.*]]: tensor<10xi32> func.func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> { // CHECK: %[[RESULT:.*]] = "tf.XlaCallModule"(%[[ARG0]], %[[ARG1]]) - // CHECK-SAME: _entry_function = @main0, // CHECK-NOT: function_list // CHECK-SAME: module = "" + // CHECK-SAME: _entry_function = @main0, // `module` is stablehlo bytecode for: // func.func @main(%arg0: tensor {jax.arg_info = "x", mhlo.sharding = "{replicated}"}, %arg1: tensor<*xi32>) -> (tensor {jax.result_info = ""}) { @@ -36,9 +36,9 @@ module { // CHECK-SAME: %[[ARG0:.*]]: tensor<10xi32>, %[[ARG1:.*]]: tensor<10xi32> func.func @foo(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> { // CHECK: %[[RESULT:.*]] = "tf.XlaCallModule"(%[[ARG0]], %[[ARG1]]) - // CHECK-SAME: _entry_function = @main1, // CHECK-NOT: function_list // CHECK-SAME: module = "" + // CHECK-SAME: _entry_function = @main1, // `module` is stablehlo bytecode for: // func.func @main(%arg0: tensor {jax.arg_info = "x", mhlo.sharding = "{replicated}"}, %arg1: tensor<*xi32>) -> (tensor {jax.result_info = ""}) { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_round_trip.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_round_trip.mlir index 83be1fdbe42285..f19d295c0b33ae 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_round_trip.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_round_trip.mlir @@ -13,12 +13,12 @@ module { func.func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> { // CHECK: %[[RESULT:.*]] = "tf.XlaCallModule"(%[[ARG0]], %[[ARG1]]) // CHECK-SAME: Sout = [#tf_type.shape] - // CHECK-SAME: _entry_function = @main0 - // CHECK-SAME: _stablehlo_module_attrs = {} // CHECK-NOT: function_list // CHECK-SAME: module = "" // CHECK-SAME: platforms = [] // CHECK-SAME: version = 5 + // CHECK-SAME: _entry_function = @main0 + // CHECK-SAME: _stablehlo_module_attrs = {} %0 = "tf.XlaCallModule"(%arg0, %arg1) {Sout = [#tf_type.shape], dim_args_spec = [], _entry_function = @main0, module = "", platforms = [], version = 5 : 
i64} : (tensor<10xi32>, tensor<10xi32>) -> tensor<10xi32> // CHECK: return %[[RESULT]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_cluster_formation.mlir index 282588a4bd2952..6b0f700c65eb36 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/xla_cluster_formation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_cluster_formation.mlir @@ -91,8 +91,8 @@ func.func @func(%arg0: tensor) -> tensor { // Check that we encapsulate the function body of entry functions with compilation markers, and not the included partitioned calls with the markers. // CHECK-LABEL: func.func @entry_function_with_compilation_markers(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "GPU", allow_soft_placement = true, device = "/device:GPU:0", tf.entry_function = {}} { // CHECK: %0 = "tf_device.cluster"() ({ -// CHECK: %1 = "tf.StatefulPartitionedCall"(%arg0) {_xla_compile_device_type = "GPU", config = "", config_proto = "", executor_type = "", f = @stateful_pcall_func} : (tensor) -> tensor -// CHECK: %cst = "tf.Const"() {value = dense<5> : tensor} : () -> tensor +// CHECK: %1 = "tf.StatefulPartitionedCall"(%arg0) <{config = "", config_proto = "", executor_type = "", f = @stateful_pcall_func}> {_xla_compile_device_type = "GPU"} : (tensor) -> tensor +// CHECK: %cst = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor // CHECK: %2 = "tf.Add"(%1, %cst) : (tensor, tensor) -> tensor // CHECK: tf_device.return %2 : tensor // CHECK: }) {_cluster_outlined_function_name = "entry_function_with_compilation_markers_cluster_func", _xla_compile_device_type = "GPU", allow_soft_placement = true, device = "/device:GPU:0"} : () -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite.mlir index 914c753c1ad447..4e0284816a7644 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite.mlir @@ -4,7 +4,7 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:GPU:0"]} { // CHECK-LABEL: func.func @convert_cluster_func func.func @convert_cluster_func(%arg0: tensor) -> tensor { - // CHECK: "tf.XlaLaunch"(%arg0) {function = @func, operandSegmentSizes = array} : (tensor) -> tensor + // CHECK: "tf.XlaLaunch"(%arg0) <{function = @func, operandSegmentSizes = array}> : (tensor) -> tensor %0 = "tf_device.cluster_func"(%arg0) {func = @func} : (tensor) -> tensor func.return %0 : tensor } @@ -19,7 +19,7 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:GPU:0"]} { // CHECK-LABEL: func.func @convert_cluster_func_with_resources_in_order func.func @convert_cluster_func_with_resources_in_order(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "tf.XlaLaunch"(%arg1, %arg0) {function = @func_with_resources_in_order, operandSegmentSizes = array} : (tensor, tensor) -> tensor + // CHECK: "tf.XlaLaunch"(%arg1, %arg0) <{function = @func_with_resources_in_order, operandSegmentSizes = array}> : (tensor, tensor) -> tensor %0 = "tf_device.cluster_func"(%arg1, %arg0) {func = @func_with_resources_in_order} : (tensor, tensor) -> (tensor) func.return %0 : tensor } @@ -34,9 +34,9 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/ module attributes {tf.devices = 
["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:GPU:0"]} { // CHECK-LABEL: func.func @convert_cluster_func_with_resources func.func @convert_cluster_func_with_resources(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "tf.XlaLaunch"(%arg1, %arg0) {function = @func_with_resources, operandSegmentSizes = array} : (tensor, tensor) -> tensor + // CHECK: "tf.XlaLaunch"(%arg1, %arg0) <{function = @func_with_resources, operandSegmentSizes = array}> : (tensor, tensor) -> tensor %0 = "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_resources} : (tensor, tensor) -> tensor - // CHECK: "tf.XlaLaunch"(%arg1, %arg0) {function = @func_with_resources, operandSegmentSizes = array} : (tensor, tensor) -> tensor + // CHECK: "tf.XlaLaunch"(%arg1, %arg0) <{function = @func_with_resources, operandSegmentSizes = array}> : (tensor, tensor) -> tensor %1 = "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_resources} : (tensor, tensor) -> tensor return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir index 7d225cec72502a..e79eb9f0b1fbab 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir @@ -5,7 +5,7 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/ // CHECK-LABEL: func.func @convert_cluster_func func.func @convert_cluster_func(%arg0: tensor) -> tensor { // CHECK: "tf_device.launch"() - // CHECK: "tf._XlaCompile"(%arg0) {function = @func, must_compile = true, operandSegmentSizes = array} : (tensor) -> (tensor<3x!tf_type.string>, tensor) + // CHECK: "tf._XlaCompile"(%arg0) <{function = @func, must_compile = true, operandSegmentSizes = array}> : (tensor) -> (tensor<3x!tf_type.string>, tensor) // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: "tf_device.launch"() // CHECK: "tf._XlaRun"(%arg0, %0#0) : (tensor, tensor<3x!tf_type.string>) -> tensor @@ -25,7 +25,7 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/ // CHECK-LABEL: func.func @convert_cluster_func_with_resources_in_order func.func @convert_cluster_func_with_resources_in_order(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK: "tf_device.launch"() - // CHECK: "tf._XlaCompile"(%arg1, %arg0) {function = @func_with_resources_in_order, must_compile = true, operandSegmentSizes = array} : (tensor, tensor) + // CHECK: "tf._XlaCompile"(%arg1, %arg0) <{function = @func_with_resources_in_order, must_compile = true, operandSegmentSizes = array}> : (tensor, tensor) // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: "tf_device.launch"() // CHECK: "tf._XlaRun"(%arg1, %arg0, %0#0) : (tensor, tensor, tensor<3x!tf_type.string>) -> tensor @@ -45,14 +45,14 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/ // CHECK-LABEL: func.func @convert_cluster_func_with_resources func.func @convert_cluster_func_with_resources(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK: "tf_device.launch"() - // CHECK: "tf._XlaCompile"(%arg1, %arg0) {function = @func_with_resources_1, must_compile = true, operandSegmentSizes = array} : (tensor, tensor) -> (tensor<3x!tf_type.string>, tensor) + // CHECK: "tf._XlaCompile"(%arg1, %arg0) <{function = @func_with_resources_1, must_compile = true, operandSegmentSizes = array}> : (tensor, tensor) -> (tensor<3x!tf_type.string>, tensor) // CHECK: {device = 
"/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: "tf_device.launch"() // CHECK: "tf._XlaRun"(%arg1, %arg0, %0#0) : (tensor, tensor, tensor<3x!tf_type.string>) -> tensor // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> tensor %0 = "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_resources_1, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor, tensor) -> tensor // CHECK: "tf_device.launch"() - // CHECK: "tf._XlaCompile"(%arg1, %arg0) {function = @func_with_resources_2, must_compile = true, operandSegmentSizes = array} : (tensor, tensor) -> (tensor<3x!tf_type.string>, tensor) + // CHECK: "tf._XlaCompile"(%arg1, %arg0) <{function = @func_with_resources_2, must_compile = true, operandSegmentSizes = array}> : (tensor, tensor) -> (tensor<3x!tf_type.string>, tensor) // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: "tf_device.launch"() // CHECK: "tf._XlaRun"(%arg1, %arg0, %2#0) : (tensor, tensor, tensor<3x!tf_type.string>) -> tensor @@ -77,7 +77,7 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/ module attributes {tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0"], tf.versions = {producer = 888 : i32}} { func.func @outside_compilation_in_generic_pipeline(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK: tf_device.launch - // CHECK: "tf._XlaCompile"() {function = @func, must_compile = true, operandSegmentSizes = array} + // CHECK: "tf._XlaCompile"() <{function = @func, must_compile = true, operandSegmentSizes = array}> // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"} // CHECK: tf_device.parallel_execute // CHECK: tf_device.launch diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD index d69de7494e4b5b..00ab20970f113d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD @@ -76,7 +76,7 @@ cc_library( ":bridge", ":tensorflow_passes", ":tf_saved_model_passes", - "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", + "//tensorflow/compiler/mlir/tf2xla/internal:clustering_bridge_passes", "//tensorflow/compiler/mlir/tf2xla/transforms:legalize_tf", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index d657fc940bc3b5..3e0c19fb87acc3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -31,10 +31,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" -#include "tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h" -#include "tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h" -#include "tensorflow/compiler/mlir/tf2xla/api/v2/device_type.pb.h" -#include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h" #include "tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h" #include "tensorflow/compiler/mlir/tf2xla/internal/inference/inference_passes.h" #include "tensorflow/compiler/mlir/tf2xla/internal/logging_hooks.h" @@ -48,68 +44,9 @@ limitations under the License. 
namespace mlir { namespace TFTPU { -namespace { constexpr char kBridgeComponent[] = "TFXLABridge"; -// Run the TF XLA Bridge based on the input pipeline, which can be either TPU -// bridge pipeline or non TPU bridge pipeline. -tensorflow::Status RunTFXLABridge( - ModuleOp module, - llvm::function_ref pipeline_builder, - llvm::StringRef module_name = llvm::StringRef()) { - // Explicitly check that the TensorFlow dialect can constant fold ops. - // Constant folding is essential for the bridge. Without this check, the - // bridge may fail with an error that is difficult to understand and not - // actionable. - if (!TF::TensorFlowDialect::HasConstantFoldHook()) { - return tensorflow::errors::Internal( - "TensorFlow dialect missing constant fold hook in TFXLA bridge phase " - "1; this could happen if the binary doesn't link the constant fold " - "hook registration library."); - } - - PassManager bridge(module.getContext()); - ::tensorflow::applyTensorflowAndCLOptions(bridge); - - // Populate a passmanager with the list of passes that implement the bridge. - pipeline_builder(bridge); - - mlir::StatusScopedDiagnosticHandler diag_handler( - module.getContext(), /*propagate=*/false, - /*filter_stack=*/!VLOG_IS_ON(1)); - - if (VLOG_IS_ON(1) || - DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain)) { - ::tensorflow::DumpMlirOpToFile( - DEBUG_DATA_DUMPER()->GetDumpFilename(module_name.str(), kDebugGroupMain, - "tf_xla_bridge_before"), - module, llvm::StringRef(), &bridge); - } - - if (VLOG_IS_ON(2) || - DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), - kDebugGroupBridgePhase1Clustering)) { - ::tensorflow::tf2xla::internal::EnablePassIRPrinting( - bridge, kDebugGroupBridgePhase1Clustering, module_name); - } - - LogicalResult result = bridge.run(module); - (void)result; - - if (VLOG_IS_ON(1) || - DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain)) { - ::tensorflow::DumpMlirOpToFile( - DEBUG_DATA_DUMPER()->GetDumpFilename(module_name.str(), kDebugGroupMain, - "tf_xla_bridge_after"), - module, llvm::StringRef(), &bridge); - } - - return diag_handler.ConsumeStatus(); -} - -} // namespace - void CreateTPUBridgePipeline(OpPassManager &pm, llvm::StringRef module_name) { pm.addPass(CreateTPUValidateInputsPass()); pm.addNestedPass( @@ -151,17 +88,5 @@ tensorflow::Status RunBridgeWithStandardPipeline(ModuleOp module, return diag_handler.ConsumeStatus(); } -void CreateTFXLABridgePipeline(OpPassManager &pm) { - tensorflow::tf2xla::internal::AddNonTPUBridgeClusteringPipelinePasses(pm); -} - -tensorflow::Status RunTFXLABridge(ModuleOp module, - llvm::StringRef module_name) { - // CPU == GPU here, so both are equivalent. - return tensorflow::tf2xla::v2::RunFunctionTf2xlaClusteringBridge( - module, tensorflow::tf2xla::v2::XLA_GPU_JIT, - /*is_in_fallback_enabled_mode=*/false, module_name); -} - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h index 72875f2143b75e..671235095b4d7c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h @@ -40,11 +40,6 @@ ABSL_DEPRECATED( tensorflow::Status RunBridgeWithStandardPipeline(ModuleOp module, bool enable_logging, bool enable_inliner); - -// Runs all passes for non TPU (GPU and CPU) graph. 
-ABSL_DEPRECATED("Use tf2xla::v2::RunFunctionTf2xlaClusteringBridge instead.") -tensorflow::Status RunTFXLABridge( - ModuleOp module, llvm::StringRef module_name = llvm::StringRef()); } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD index fcf2b1bca58552..359dd5c4624712 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD @@ -29,14 +29,20 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", "//tensorflow/core:framework", "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core/platform:error_payloads", + "//tensorflow/core/platform:status", "//tensorflow/core/tpu:tpu_defs", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/log", + "@com_google_absl//absl/status", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", + "@local_tsl//tsl/platform:error_logging", + "@local_tsl//tsl/platform:errors", ], ) @@ -56,6 +62,7 @@ tf_cc_test( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:test", + "//tensorflow/core/lib/monitoring:cell_reader", "//tensorflow/core/platform:resource_loader", "//tensorflow/core/tpu:tpu_defs", "@com_google_absl//absl/status", diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc index 4dfe7c1fc19179..cba2b05cc2e78a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include "absl/log/log.h" +#include "absl/status/status.h" #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -29,10 +30,15 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/core/framework/metrics.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/error_payloads.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/tpu/tpu_defs.h" #include "tensorflow/core/util/debug_data_dumper.h" #include "tsl/framework/device_type.h" +#include "tsl/platform/error_logging.h" +#include "tsl/platform/errors.h" namespace tensorflow { namespace tfrt_compiler { @@ -111,6 +117,39 @@ void CreateNonTPULowerClusterToRuntimeOpsPassPipeline( AddNonTPULowerClusterToRuntimeOpsPassPipeline(pm, /*module_name=*/""); } +// TODO(b/306728216): Move this out of the Bridge component and into a Host +// runtime component. 
+tensorflow::Status RecordIfErrorStatus(const std::string error_prefix, + tsl::DeviceType device_type, + absl::Status status) { + if (status.ok()) { + return status; + } + + VLOG(2) << error_prefix << " " << status; + tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter( + device_type.type_string(), /*bridge_version=*/"v2", + /*fallback_enabled=*/false, + /*result=*/"failure"); + + constexpr char kBridgeComponent[] = "TFXLABridge"; + std::string bridge_subcomponent = "TFXLA_PHASE_ONE_MLIR_TPU_BRIDGE"; + + tsl::OkOrSetErrorCounterPayload( + tensorflow::core::platform::ErrorSourceProto::MLIR_BRIDGE_PHASE_1, + status); + + if (device_type != DeviceType(DEVICE_TPU_XLA_JIT)) { + bridge_subcomponent = "TFXLA_PHASE_ONE_MLIR_CPU/GPU_BRIDGE"; + } + + tsl::error_logging::Log(kBridgeComponent, bridge_subcomponent, + status.ToString()) + .IgnoreError(); + + return status; +} + absl::Status RunLowerClusterToRuntimeOpsPassPipeline( mlir::ModuleOp module, tsl::DeviceType xla_device_type, llvm::StringRef module_name) { @@ -154,7 +193,12 @@ absl::Status RunLowerClusterToRuntimeOpsPassPipeline( module, llvm::StringRef(), &runtime_lowering); } - return diag_handler.ConsumeStatus(); + auto result_status = diag_handler.ConsumeStatus(); + TF_RETURN_IF_ERROR( + RecordIfErrorStatus(/*error_prefix=*/"lower_cluster_to_runtime", + xla_device_type, result_status)); + + return absl::OkStatus(); } // TODO(b/305211853): Unify the CPU/TPU/GPU Execution Ops and thus these two diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc index b58a13d679adb6..e0e376e200e992 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h" +#include #include #include @@ -33,6 +34,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/register_common_dialects.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/lib/monitoring/cell_reader.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" @@ -51,6 +53,7 @@ using mlir::ModuleOp; using mlir::OpPassManager; using mlir::OwningOpRef; using mlir::func::FuncOp; +using ::tensorflow::monitoring::testing::CellReader; using tsl::DeviceType; std::string TestDataPath() { @@ -58,6 +61,9 @@ std::string TestDataPath() { "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/testdata/"); } +static constexpr char kCompilationStreamz[] = + "/tensorflow/core/tf_mlir_bridge_first_phase_count"; + class LowerClusterToRuntimeOpsTest : public ::testing::Test { public: LowerClusterToRuntimeOpsTest() { @@ -154,11 +160,17 @@ TEST_F(LowerClusterToRuntimeOpsTest, LowersClusterOpsGPU) { } TEST_F(LowerClusterToRuntimeOpsTest, ErrorsWithBadCluster) { + CellReader compilation_status(kCompilationStreamz); + TF_ASSERT_OK(CreateMlirModule("malformed_cluster.mlir")); EXPECT_FALSE(RunLowerClusterToRuntimeOpsPassPipeline( *mlir_module_, DeviceType(DEVICE_TPU_XLA_JIT)) .ok()); + + EXPECT_EQ(compilation_status.Delta("XLA_TPU_JIT", "v2", "fallback_disabled", + "failure"), + 1); } TEST_F(LowerClusterToRuntimeOpsTest, DumpsPipelinePasses) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.cc index aa1a74a1a5a1fb..523cdf290e6772 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.cc @@ -24,13 +24,14 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/bridge.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" +#include "tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" namespace tensorflow { void PopulateLowerToMlProgramAndHloPipeline(mlir::OpPassManager& pm) { - mlir::TF::CreateTFXLABridgePipeline(pm); + tensorflow::tf2xla::internal::AddNonTPUBridgeClusteringPipelinePasses(pm); // Remove unused global tensors, or make then immutable if possible. pm.addPass(mlir::tf_saved_model::CreateOptimizeGlobalTensorsPass()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 247ea10dd68b50..caa76907a3ee45 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -318,10 +318,6 @@ std::unique_ptr> CreatePrintPass( // Moves TPUCompileMlir ops as far to the front as possible. std::unique_ptr> CreateMoveTpuCompileToFrontPass(); -// Populates the supplied passmanager with the passes required to run the -// CPU/GPU bridge. 
-void CreateTFXLABridgePipeline(OpPassManager& pm); - //===----------------------------------------------------------------------===// // XlaCallModule //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc index d81ee1e8e33b1a..78f2c3e0423124 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc @@ -17,9 +17,13 @@ limitations under the License. // the TensorFlow dialect to their functional counterparts, i.e., // tf.IfRegion -> tf.If and tf.WhileRegion -> tf.While +#include +#include #include +#include #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" @@ -27,18 +31,21 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" @@ -69,6 +76,8 @@ struct RegionControlFlowToFunctional CaseRegionOp case_region); LogicalResult ConvertWhileOp(SymbolTableCollection& symbol_table, WhileRegionOp while_region); + LogicalResult ConvertGeneratorDatasetOp(SymbolTableCollection& symbol_table, + GeneratorDatasetRegionOp regional); // Get unique name by using the loc to name mapping. std::string GetName(Operation* op, StringRef suffix); @@ -124,6 +133,37 @@ void CopyAndOverrideAttributes(Operation* src, Operation* dst, dst->setAttr(kXlaPropagateCompileTimeConsts, builder->getBoolAttr(true)); } +// If the region only does a single function call whose operands / returns match +// exactly the block args and results, return the name of the called function. 
+std::optional UnwrapSingleFunctionCall(Region& region) { + // The pattern we're matching is + // ^block(arg0, arg1, ..., argN): + // r0, r1, ..., rN = func.call @foo(arg0, arg1, ..., argN) + // "tf.yield"(r0, r1, ..., rN) + if (!region.hasOneBlock()) return std::nullopt; + Block& block = region.front(); + if (std::distance(block.begin(), block.end()) != 2) return std::nullopt; + TF::YieldOp yield = + llvm::dyn_cast_or_null(block.getTerminator()); + if (!yield) return std::nullopt; + func::CallOp call = llvm::dyn_cast_or_null(*block.begin()); + if (!call) return std::nullopt; + if (block.getNumArguments() != call.getNumOperands() || + call.getNumResults() != yield.getNumOperands()) + return std::nullopt; + for (auto [arg, operand] : + llvm::zip(block.getArguments(), call.getOperands())) { + if (arg != operand) return std::nullopt; + } + for (auto [ret, operand] : + llvm::zip(call.getResults(), yield.getOperands())) { + if (ret != operand) return std::nullopt; + } + SymbolRefAttr symbol = call.getCallableForCallee().get(); + if (!symbol) return std::nullopt; + return symbol.getLeafReference(); +} + // Extracts the contents of a region with a single block into a new function. // `extern_values` is the set of external values that the region refers to. // Returns the name of the newly created function. @@ -135,7 +175,13 @@ StringRef ExtractSingleBlockRegion( SymbolTableCollection& symbol_table, Region& region, StringRef name, llvm::SmallVectorImpl& extern_values, llvm::SmallVectorImpl& worklist, - bool extern_values_passthrough, bool only_one_return_value) { + bool extern_values_passthrough, bool only_one_return_value, + bool allow_return_of_existing = false) { + if (allow_return_of_existing && extern_values.empty()) { + auto existing = UnwrapSingleFunctionCall(region); + if (existing) return *existing; + } + ModuleOp module = region.getParentOfType(); auto builder = OpBuilder::atBlockBegin(module.getBody()); auto loc = region.getParentOp()->getLoc(); @@ -524,6 +570,52 @@ LogicalResult RegionControlFlowToFunctional::ConvertWhileOp( return success(); } +// Transform GeneratorDatasetRegion to GeneratorDatasetOp. 
+LogicalResult RegionControlFlowToFunctional::ConvertGeneratorDatasetOp( + SymbolTableCollection& symbol_table, GeneratorDatasetRegionOp regional) { + mlir::MLIRContext* ctx = regional.getContext(); + std::string init_name, next_name, finalize_name; + + llvm::SmallVector extern_values = + CollectExternValues(regional.getRegions()); + + if (!extern_values.empty()) return failure(); + + init_name = GetName(regional, "_init"); + init_name = ExtractSingleBlockRegion(symbol_table, regional.getInit(), + init_name, extern_values, worklist, + /*extern_values_passthrough=*/false, + /*only_one_return_value=*/false, + /*allow_return_of_existing=*/true); + + next_name = GetName(regional, "_next"); + next_name = ExtractSingleBlockRegion(symbol_table, regional.getNext(), + next_name, extern_values, worklist, + /*extern_values_passthrough=*/false, + /*only_one_return_value=*/false, + /*allow_return_of_existing=*/true); + + finalize_name = GetName(regional, "_finalize"); + finalize_name = + ExtractSingleBlockRegion(symbol_table, regional.getFinalize(), + finalize_name, extern_values, worklist, + /*extern_values_passthrough=*/false, + /*only_one_return_value=*/false, + /*allow_return_of_existing=*/true); + + auto new_op = OpBuilder(regional).create( + regional.getLoc(), regional->getResultTypes(), + regional.getInitFuncOtherArgs(), regional.getNextFuncOtherArgs(), + regional.getFinalizeFuncOtherArgs(), SymbolRefAttr::get(ctx, init_name), + SymbolRefAttr::get(ctx, next_name), + SymbolRefAttr::get(ctx, finalize_name), regional.getOutputTypes(), + regional.getOutputShapes(), regional.getMetadata()); + + regional->replaceAllUsesWith(new_op->getResults()); + regional->erase(); + return success(); +} + void RegionControlFlowToFunctional::runOnOperation() { ModuleOp module = getOperation(); SymbolTableCollection symbol_table; @@ -549,6 +641,11 @@ void RegionControlFlowToFunctional::runOnOperation() { op->emitOpError() << "failed to convert to functional form"; return WalkResult::interrupt(); } + } else if (auto gen = llvm::dyn_cast(op)) { + if (failed(ConvertGeneratorDatasetOp(symbol_table, gen))) { + op->emitOpError() << "failed to convert to functional form"; + return WalkResult::interrupt(); + } } return WalkResult::advance(); }); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 581e04a3b8f643..ea51b928562a46 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -1257,7 +1257,7 @@ bool ShapeInference::InferShapeForXlaCallModule(XlaCallModuleOp op) { bool changed = false; for (auto [result, type] : - llvm::zip(op.getResults(), loader->output_types())) { + llvm::zip(op.getResults(), loader->OutputTypes())) { auto ranked = type.dyn_cast(); if (ranked == nullptr) { LLVM_DEBUG(llvm::dbgs() diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD index bd581f689586ec..d0653b9677d5c7 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD @@ -233,14 +233,19 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", "//tensorflow/compiler/mlir/tf2xla/internal:logging_hooks", "//tensorflow/core:framework", + "//tensorflow/core/platform:error_payloads", "//tensorflow/core/platform:status", "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/log", + 
"@com_google_absl//absl/status", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", + "@local_tsl//tsl/lib/monitoring:counter", + "@local_tsl//tsl/platform:error_logging", "@local_tsl//tsl/platform:status", ], ) @@ -255,6 +260,7 @@ tf_cc_test( deps = [ ":tf_dialect_to_executor", "//tensorflow/compiler/mlir:register_common_dialects", + "//tensorflow/core/lib/monitoring:cell_reader", "//tensorflow/core/platform:resource_loader", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc index 105b9016bb3c24..2f8469ee3f6f69 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc @@ -202,41 +202,6 @@ absl::Status RunClusteringPipelineOnSubmodule( return absl::OkStatus(); } -absl::Status RunLowerToRuntimeOpsOnSubmodule(ModuleOp parent_module, - bool is_in_fallback_enabled_mode) { - int num_submodules = 0; - absl::Status runtime_lowering_status; - parent_module.walk([&](ModuleOp submodule) { - if (submodule == parent_module) return mlir::WalkResult::advance(); - num_submodules++; - runtime_lowering_status = - tensorflow::tfrt_compiler::RunLowerClusterToRuntimeOpsPassPipeline( - submodule, tsl::DeviceType(DEVICE_TPU_XLA_JIT)); - if (num_submodules > 1) { - return mlir::WalkResult::interrupt(); - } - - return mlir::WalkResult::advance(); - }); - - if (num_submodules > 1) { - auto num_submodules_error = absl::InternalError( - "Lower to runtime has more than one submodule. Erroring out."); - TF_RETURN_IF_ERROR(RecordStatusIfError( - /*error_prefix=*/"V1 Lowering to runtime has more than one submodule:", - is_in_fallback_enabled_mode, num_submodules_error)); - } - - if (!runtime_lowering_status.ok()) { - TF_RETURN_IF_ERROR(RecordStatusIfError( - /*error_prefix=*/ - "Errored running lowering cluster ops to runtime ops pipeline:", - is_in_fallback_enabled_mode, runtime_lowering_status)); - } - - return absl::OkStatus(); -} - tensorflow::Status RunSessionTf2xlaClusteringBridge( ModuleOp module, bool is_in_fallback_enabled_mode) { VLOG(2) << "TPU Sessions Bridge called stack trace is " @@ -254,35 +219,12 @@ tensorflow::Status RunSessionTf2xlaClusteringBridge( TF_RETURN_IF_ERROR( RunClusteringPipelineOnSubmodule(module, is_in_fallback_enabled_mode)); - TF_RETURN_IF_ERROR( - RunLowerToRuntimeOpsOnSubmodule(module, is_in_fallback_enabled_mode)); - - Status export_preparation_status = RunTFXLABridge( - module, - [](OpPassManager &pm) { - pm.addPass( - mlir::tf_executor::CreateTFExecutorTPUV1IslandInliningPass()); - // There are cases where we don't consume all compilation and - // replication attributes like we do for the V2 pipeline, so we need to - // convert them from unified to legacy attributes before they get - // exposed to outside of the bridge. 
- pm.addNestedPass( - mlir::TFTPU:: - CreateConvertToLegacyCompileAndReplicateAttributesPass()); - }, - /*module_name=*/"", - /*dump_prefix=*/"tf_xla_bridge_v1_export_preparation"); - - TF_RETURN_IF_ERROR(RecordStatusIfError( - /*error_prefix=*/"Bridge Export Preparation Failed:", - is_in_fallback_enabled_mode, export_preparation_status)); - tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter( /*device_type=*/"tpu", /*bridge_version=*/"v1", /*n_fallback_enabled*/ is_in_fallback_enabled_mode, /*result=*/"success"); - return tensorflow::tf2xla::v1::ExportFromTensorflowDialectToExecutor(module); + return absl::OkStatus(); } // Registers a pipeline builder function for TF TPU V1 bridge. diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h index 441e509ed187db..e27ec14770b2be 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h @@ -29,11 +29,11 @@ namespace v1 { // These transformations take as input a Tensorflow Graph as an MLIR Module // and transforms the module in place to cluster the given ops for compilation // that is compatible with the given device_type. The MLIR should be in the TF -// Executor Dialect for graph nodes and edges. Individual Op inside a node -// should be the Tensorflow Dialect. The output MLIR is in the TF Executor -// Dialect. The input MLIR should not have infeed and outfeed ops, which are -// unsupported via this API. -// Returns OkStatus if passed, otherwise an error. +// Executor Dialect for graph nodes and edges or TF Functional. It will convert +// to TF Functional internally. Individual Op inside a node should be the +// Tensorflow Dialect. The output MLIR is in the TF Functional Dialect. The +// input MLIR should not have infeed and outfeed ops, which are unsupported via +// this API. Returns OkStatus if passed, otherwise an error. tensorflow::Status RunSessionTf2xlaClusteringBridge( mlir::ModuleOp module, bool is_in_fallback_enabled_mode); diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc index 3b195bd3977fe1..44eafb25f579c8 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc @@ -88,18 +88,6 @@ TEST_F(SessionClusterTensorflowDialectTest, ClustersTf) { compilation_status.Delta("tpu", "v1", "fallback_disabled", "success"), 1); } -// Required for now due to the Bridge API, but this should be separated out -// later. -TEST_F(SessionClusterTensorflowDialectTest, - RunsTensorflowDialectToTensorflowExecutor) { - TF_ASSERT_OK(CreateMlirModule("invalid_executor.mlir")); - - EXPECT_FALSE( - RunSessionTf2xlaClusteringBridge(*mlir_module_, - /*is_in_fallback_enabled_mode=*/false) - .ok()); -} - TEST_F(SessionClusterTensorflowDialectTest, FailsWithMultipleSubmodules) { CellReader compilation_status(kCompilationStreamz); diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc index 941dd362a6c8d4..236282f625e20a 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc @@ -19,6 +19,8 @@ limitations under the License. 
#include #include +#include "absl/log/log.h" +#include "absl/status/status.h" #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -32,8 +34,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tf2xla/internal/logging_hooks.h" +#include "tensorflow/core/platform/error_payloads.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/util/debug_data_dumper.h" +#include "tsl/lib/monitoring/counter.h" +#include "tsl/platform/error_logging.h" #include "tsl/platform/status.h" namespace tensorflow { @@ -47,6 +52,15 @@ using mlir::Pass; using mlir::PassManager; using mlir::func::FuncOp; +auto *tf_dialect_to_executor_dialect_status = tsl::monitoring::Counter<1>::New( + "/tensorflow/core/tf2xla/api/v1/tf_dialect_to_executor_dialect_status", + "Counts how often a successful export from TF Dialect to Executor Dialect " + "is", + "status"); + +constexpr char kExportSuccess[] = "success"; +constexpr char kExportFailed[] = "failed"; + namespace { void AddTfDialectToExecutorPasses(OpPassManager &pm) { @@ -55,6 +69,14 @@ void AddTfDialectToExecutorPasses(OpPassManager &pm) { pm.addPass(mlir::CreateBreakUpIslandsPass()); }; + pm.addPass(mlir::tf_executor::CreateTFExecutorTPUV1IslandInliningPass()); + // There are cases where we don't consume all compilation and + // replication attributes like we do for the V2 pipeline, so we need to + // convert them from unified to legacy attributes before they get + // exposed to outside of the bridge. + pm.addNestedPass( + mlir::TFTPU::CreateConvertToLegacyCompileAndReplicateAttributesPass()); + pm.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional()); add_pass(mlir::CreateFunctionalToExecutorDialectConversionPass()); add_pass(mlir::TFDevice::CreateReplicateToIslandPass( @@ -75,6 +97,30 @@ void AddTfDialectToExecutorPasses(OpPassManager &pm) { pm.addPass(mlir::TF::CreateVerifySuitableForExportPass()); } +tensorflow::Status RecordStatusIfError(absl::Status status) { + if (status.ok()) { + return absl::OkStatus(); + } + + VLOG(1) << "Failed to export from TF Dialect to TF Executor Dialect. 
" + << status; + tf_dialect_to_executor_dialect_status->GetCell(kExportFailed)->IncrementBy(1); + + constexpr char bridge_subcomponent[] = + "TFXLA_TF_FUNCTIONAL_TO_EXECUTOR_EXPORT_v1"; + constexpr char kBridgeComponent[] = "TFXLABridge"; + + tsl::OkOrSetErrorCounterPayload( + tensorflow::core::platform::ErrorSourceProto::MLIR_BRIDGE_PHASE_1, + status); + + tsl::error_logging::Log(kBridgeComponent, bridge_subcomponent, + status.ToString()) + .IgnoreError(); + + return status; +} + } // namespace tensorflow::Status ExportFromTensorflowDialectToExecutor( @@ -116,6 +162,13 @@ tensorflow::Status ExportFromTensorflowDialectToExecutor( module, llvm::StringRef(), &tf_to_executor); } + if (result.failed()) { + return RecordStatusIfError(diag_handler.ConsumeStatus()); + } + + tf_dialect_to_executor_dialect_status->GetCell(kExportSuccess) + ->IncrementBy(1); + return diag_handler.ConsumeStatus(); } diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc index 80a770169a88b2..38393d3753146e 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h" +#include #include #include @@ -26,6 +27,7 @@ limitations under the License. #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "tensorflow/compiler/mlir/register_common_dialects.h" +#include "tensorflow/core/lib/monitoring/cell_reader.h" #include "tensorflow/core/platform/resource_loader.h" #include "tsl/lib/core/status_test_util.h" #include "tsl/platform/status.h" @@ -41,6 +43,10 @@ using mlir::DialectRegistry; using mlir::MLIRContext; using mlir::ModuleOp; using mlir::OwningOpRef; +using tensorflow::monitoring::testing::CellReader; + +static constexpr char kCompilationStreamz[] = + "/tensorflow/core/tf2xla/api/v1/tf_dialect_to_executor_dialect_status"; std::string TestDataPath() { return tensorflow::GetDataDependencyFilepath( @@ -73,15 +79,23 @@ class TensorflowDialectToExecutorTest : public ::testing::Test { }; TEST_F(TensorflowDialectToExecutorTest, ConvertsToExecutor) { + CellReader compilation_status(kCompilationStreamz); + TF_ASSERT_OK(CreateMlirModule("empty_func.mlir")); TF_EXPECT_OK(ExportFromTensorflowDialectToExecutor(*mlir_module_)); + + EXPECT_EQ(compilation_status.Delta("success"), 1); } TEST_F(TensorflowDialectToExecutorTest, ErrorsWhenCannotConvert) { + CellReader compilation_status(kCompilationStreamz); + TF_ASSERT_OK(CreateMlirModule("invalid_executor.mlir")); EXPECT_FALSE(ExportFromTensorflowDialectToExecutor(*mlir_module_).ok()); + + EXPECT_EQ(compilation_status.Delta("failed"), 1); } } // namespace diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD index b673432a9f28b0..03445b8a9c341d 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD @@ -171,7 +171,9 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", "//tensorflow/compiler/mlir/tf2xla/internal:logging_hooks", "//tensorflow/core:framework", + "//tensorflow/core/platform:error_payloads", "//tensorflow/core/platform:status", + "@com_google_absl//absl/log", "@com_google_absl//absl/status", "@llvm-project//llvm:Support", 
"@llvm-project//mlir:FuncDialect", @@ -180,6 +182,8 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", "@local_tsl//tsl/lib/monitoring:counter", + "@local_tsl//tsl/platform:error_logging", + "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:status", ], ) diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc index b07d0340705e01..4d507cf22625ed 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc @@ -200,19 +200,6 @@ tensorflow::Status RunNonTPUBridge(ModuleOp module, is_in_fallback_enabled_mode, device_type, clustering_status)); - Status runtime_lowering_status = - tensorflow::tfrt_compiler::RunLowerClusterToRuntimeOpsPassPipeline( - module, tsl::DeviceType("XLA_GPU_JIT"), module_name); - TF_RETURN_IF_ERROR(RecordIfErrorStatus(/*error_prefix=*/"runtime_lowering_v2", - is_in_fallback_enabled_mode, - device_type, runtime_lowering_status)); - - Status export_status = - tensorflow::tf2xla::v2::ExportFromTensorflowDialectToExecutor( - module, module_name); - TF_RETURN_IF_ERROR(RecordIfErrorStatus(/*error_prefix=*/"export_to_executor", - is_in_fallback_enabled_mode, - device_type, export_status)); tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter( device_type, /*bridge_version=*/"v2", is_in_fallback_enabled_mode, /*result=*/"success"); diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h index d597ca89e8bb23..e1298ac53560d3 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h @@ -31,13 +31,14 @@ namespace v2 { // API. These transformations take as input a Tensorflow Graph as an MLIR Module // and transforms the module in place to cluster the given ops for compilation // that is compatible with the given device_type. The MLIR should be in the TF -// Executor Dialect for graph nodes and edges. Individual Op inside a node -// should be the Tensorflow Dialect. The output MLIR is in the TF Executor -// Dialect. Returns OkStatus if passed, otherwise an error. +// Executor Dialect for graph nodes and edges or be in TF Functional already. +// Individual Op inside a node should be the Tensorflow Functional Dialect. The +// output MLIR is in the TF Functional Dialect. Returns OkStatus if passed, +// otherwise an error. // // Inputs: // module - The MLIR Module that will be clustered. Expected to be in TF -// Executor Dialect +// Executor Dialect or TF Functional Dialect. Will convert to TF Functional. // . device_type - The device type to cluster for. // is_in_fallback_enabled_mode - Whether this was called with fallback to the // non-MLIR Bridge. This is just for logging purposes and doesn't affect diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc index e9a6de55693f8f..d00d8b43d9e790 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h" +#include #include #include @@ -132,13 +133,6 @@ TEST_F(FunctionClusterTensorflowDialectTest, ClustersTFCPU) { FuncOp main = mlir_module_->lookupSymbol("main"); ASSERT_TRUE(main); - bool has_graph_op = false; - main.walk([&](mlir::tf_executor::GraphOp graph) { - has_graph_op = true; - return WalkResult::advance(); - }); - - EXPECT_TRUE(has_graph_op); EXPECT_EQ( compilation_status.Delta("cpu/gpu", "v2", "fallback_disabled", "success"), 1); @@ -156,14 +150,6 @@ TEST_F(FunctionClusterTensorflowDialectTest, ClustersTFGPU) { FuncOp main = mlir_module_->lookupSymbol("main"); ASSERT_TRUE(main); - bool has_graph_op = false; - main.walk([&](mlir::tf_executor::GraphOp graph) { - has_graph_op = true; - return WalkResult::advance(); - }); - - EXPECT_TRUE(has_graph_op); - EXPECT_EQ( compilation_status.Delta("cpu/gpu", "v2", "fallback_disabled", "success"), 1); diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc index da42af23d62772..69f1c0e20a5e1b 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/log/log.h" #include "absl/status/status.h" #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -31,9 +32,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tf2xla/internal/logging_hooks.h" +#include "tensorflow/core/platform/error_payloads.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/util/debug_data_dumper.h" #include "tsl/lib/monitoring/counter.h" +#include "tsl/platform/error_logging.h" #include "tsl/platform/status.h" namespace tensorflow { @@ -93,6 +96,30 @@ void AddTfDialectToExecutorPasses(OpPassManager &pm) { pm.addPass(mlir::TF::CreateVerifySuitableForExportPass()); } +tensorflow::Status RecordStatusIfError(absl::Status status) { + if (status.ok()) { + return absl::OkStatus(); + } + + tf_dialect_to_executor_dialect_status->GetCell(kExportFailed)->IncrementBy(1); + VLOG(1) << "Failed to export from TF Dialect to TF Executor Dialect. " + << status; + + constexpr char bridge_subcomponent[] = + "TFXLA_TF_FUNCTIONAL_TO_EXECUTOR_EXPORT_v2"; + constexpr char kBridgeComponent[] = "TFXLABridge"; + + tsl::OkOrSetErrorCounterPayload( + tensorflow::core::platform::ErrorSourceProto::MLIR_BRIDGE_PHASE_1, + status); + + tsl::error_logging::Log(kBridgeComponent, bridge_subcomponent, + status.ToString()) + .IgnoreError(); + + return status; +} + } // namespace tensorflow::Status ExportFromTensorflowDialectToExecutor( @@ -128,12 +155,10 @@ tensorflow::Status ExportFromTensorflowDialectToExecutor( module, llvm::StringRef(), &tf_to_executor); } - if (!result.succeeded()) { - tf_dialect_to_executor_dialect_status->GetCell(kExportFailed) - ->IncrementBy(1); - - return absl::InternalError( - "Failed to export from TF Dialect to TF Executor Dialect."); + if (result.failed()) { + return RecordStatusIfError( + absl::InternalError("Failed to export from TF Dialect to TF Executor " + "Dialect. 
Read LLVM Pipeline Error")); } tf_dialect_to_executor_dialect_status->GetCell(kExportSuccess) diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD new file mode 100644 index 00000000000000..9abed32b2fac8b --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD @@ -0,0 +1,95 @@ +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") +load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.default.bzl", "filegroup", "get_compatible_with_portable") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//tensorflow/compiler/mlir:__pkg__", + "//tensorflow/compiler/mlir/tf2xla/internal:__subpackages__", + ], + licenses = ["notice"], +) + +cc_library( + name = "clustering_passes", + srcs = [ + "verify_clustering_pass.cc", + ], + hdrs = [ + "clustering_passes.h", + ], + textual_hdrs = [ + "clustering_passes.h.inc", + ], + deps = [ + ":clustering_passes_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:framework", + "//tensorflow/core/transforms/toposort:Pass", + "@com_google_absl//absl/log", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + +gentbl_cc_library( + name = "clustering_passes_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = [ + ( + [ + "-gen-pass-decls", + "-name=TFXLABridge", + ], + "clustering_passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "clustering_passes.td", + deps = [ + "@llvm-project//mlir:PassBaseTdFiles", + ], +) + +tf_cc_test( + name = "verify_clustering_pass_test", + srcs = ["verify_clustering_pass_test.cc"], + deps = [ + ":clustering_passes", + "//tensorflow/compiler/mlir/tf2xla/transforms:test_utils", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:statusor", + ], +) + +glob_lit_tests( + name = "all_tests", + data = [":test_utilities"], + driver = "@llvm-project//mlir:run_lit.sh", + test_file_exts = [ + "mlir", + ], +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/mlir:tf-opt", + "@llvm-project//llvm:FileCheck", + ], +) diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h new file mode 100644 index 00000000000000..0e477c86b5405b --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h @@ -0,0 +1,37 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_CLUSTERING_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_CLUSTERING_PASSES_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +// Verifies that all MLIR Ops have the expected attributes. +std::unique_ptr> +CreateVerifyClusteringPass(); + +#define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_VERIFYCLUSTERINGPASS +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_CLUSTERING_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td new file mode 100644 index 00000000000000..c1431369c6e0f3 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td @@ -0,0 +1,26 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +include "mlir/Pass/PassBase.td" + +def VerifyClusteringPass : Pass<"verify-clustering-pass", "mlir::func::FuncOp"> { + + let summary = "Verify that the Bridge output is correct and errors if verification fails."; + + let description = [{ + Verifies whether clustering has resulted in the expected invariants. These + include verifying that clusters have been created and have been outside + compiled, the result is device agnostic and in TF functional dialect & + that the device attribute exists. + }]; + + let constructor = "tensorflow::tf2xla::internal::CreateVerifyClusteringPass()"; +} diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass.cc new file mode 100644 index 00000000000000..2c053dbbb245e5 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass.cc @@ -0,0 +1,68 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +namespace { + +#define GEN_PASS_DEF_VERIFYCLUSTERINGPASS +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" + +class VerifyClusteringPass + : public impl::VerifyClusteringPassBase { + public: + void runOnOperation() override; +}; + +void VerifyClusteringPass::runOnOperation() { + std::set valid_namespaces = {"tf", "func", "return"}; + mlir::Operation* func_op = getOperation(); + + auto walk_result = func_op->walk([&](mlir::Operation* op) { + if (valid_namespaces.find(op->getDialect()->getNamespace().str()) == + valid_namespaces.end()) { + std::string error = "op is in dialect " + + op->getDialect()->getNamespace().str() + + " not in tf functional dialect"; + op->emitError() << error; + return mlir::WalkResult::interrupt(); + } + return mlir::WalkResult::advance(); + }); + + if (walk_result.wasInterrupted()) { + signalPassFailure(); + } +} +} // namespace + +std::unique_ptr> +CreateVerifyClusteringPass() { + return std::make_unique(); +} +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.cc new file mode 100644 index 00000000000000..6767a00c2fb4a6 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.cc @@ -0,0 +1,87 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h" +#include "tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +namespace { + +using mlir::mhlo::test::GetMlirModuleFromString; + +class VerifyClusteringPassTest : public testing::Test { + protected: + void CreateModule(const char* module_string) { + TF_ASSERT_OK_AND_ASSIGN(module_, + GetMlirModuleFromString(module_string, &context_)); + pm_ = std::make_unique(&context_); + pm_->addNestedPass(CreateVerifyClusteringPass()); + } + + mlir::LogicalResult Run() { return pm_->run(module_.get()); } + + private: + mlir::MLIRContext context_; + mlir::OwningOpRef module_; + std::unique_ptr pm_; +}; + +TEST_F(VerifyClusteringPassTest, OnlyTfFunctionalPasses) { + static constexpr char kMlirModuleStr[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main() -> tensor<1xi32> { + %0 = "tf.Const"() {value = dense<1000> : tensor<1xi32>} : () -> tensor<1xi32> + return %0 : tensor<1xi32> + } + })"; + CreateModule(kMlirModuleStr); + + auto result = Run(); + + EXPECT_TRUE(result.succeeded()); +} + +TEST_F(VerifyClusteringPassTest, NotTfFunctionalFails) { + static constexpr char kMlirModuleStr[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main() -> tensor<3x32x32x3xf32> { + %0 = mhlo.constant dense<2.550000e+02> : tensor<3x32x32x3xf32> + return %0 : tensor<3x32x32x3xf32> + } + })"; + CreateModule(kMlirModuleStr); + + auto result = Run(); + + EXPECT_TRUE(result.failed()); +} + +} // namespace +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir new file mode 100644 index 00000000000000..23e60242621f37 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir @@ -0,0 +1,16 @@ +// RUN: tf-opt -verify-clustering-pass -split-input-file -verify-diagnostics %s | FileCheck %s +// Tests the VerifyClusteringPass Pass, ensures that an error is thrown when validation fails. 
+ +func.func @testNotTfDialect(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { + // expected-error@below {{op is in dialect chlo not in tf functional dialect}} + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + func.return %0 : tensor<1x32x10x32xi32> +} + +// ----- + +// CHECK-LABEL: func @testTFDialect +func.func @testTFDialect(%arg0: tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> { + %0 = "tf.Identity"(%arg0) : (tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> + func.return %0 : tensor<4x2x!tf_type.string> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-binary-elementwise.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-binary-elementwise.mlir index 30dc8ecbbf647b..01dc4701923675 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-binary-elementwise.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-binary-elementwise.mlir @@ -266,28 +266,28 @@ func.func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tens // CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error func.func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = true} + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) <{incompatible_shape_error = true}> %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0: tensor<1x2xi1> } // CHECK-LABEL: func @equal_incompatible_shape_broadcastable func.func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = true} + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) <{incompatible_shape_error = true}> %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor<1xi32>) -> tensor func.return %0: tensor } // CHECK-LABEL: func @equal_incompatible_shape_dynamic func.func @equal_incompatible_shape_dynamic(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) <{incompatible_shape_error = false}> %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor) -> tensor<*xi1> func.return %0: tensor<*xi1> } // CHECK-LABEL: func @equal_incompatible_shape_both_dynamic func.func @equal_incompatible_shape_both_dynamic(%arg0: tensor, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = false} + // CHECK-NEXT: "tf.Equal"(%arg0, %arg1) <{incompatible_shape_error = false}> %0 = "tf.Equal"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor) -> tensor<*xi1> func.return %0: tensor<*xi1> } diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir index 32ef8cba3b832c..a732c6d61281ca 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir @@ -701,9 +701,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // Verifies that the following 
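For reference, the same check that this lit test drives through tf-opt can be run programmatically by nesting the pass on func.func in a PassManager, mirroring the C++ unit test added above. The following is a minimal sketch only; the wrapper name RunVerifyClustering is illustrative and parsing/error handling is elided.

// Illustrative sketch (not part of this change): run VerifyClusteringPass
// over an already-parsed module.
#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
#include "mlir/IR/MLIRContext.h"  // from @llvm-project
#include "mlir/Pass/PassManager.h"  // from @llvm-project
#include "mlir/Support/LogicalResult.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h"

bool RunVerifyClustering(mlir::ModuleOp module, mlir::MLIRContext* context) {
  mlir::PassManager pm(context);
  // VerifyClusteringPass is a func.func pass, so nest it as the unit test does.
  pm.addNestedPass<mlir::func::FuncOp>(
      tensorflow::tf2xla::internal::CreateVerifyClusteringPass());
  // The run fails (and the pass emits per-op errors) if any op lives outside
  // the tf/func dialects.
  return mlir::succeeded(pm.run(module));
}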
functions are added from xla_call_module. Note this must be at the end of the file. // CHECK: func.func private @main.2(%arg0: tensor {mhlo.sharding = "{replicated}"}) -> tensor { - // CHECK: %0 = mhlo.bitcast_convert %arg0 : (tensor) -> tensor - // CHECK: %1 = mhlo.sine %0 : tensor - // CHECK: return %1 : tensor + // CHECK: %0 = mhlo.sine %arg0 : tensor + // CHECK: return %0 : tensor // CHECK: } } diff --git a/tensorflow/compiler/mlir/tf2xla/tests/tfxla_device_specific_transformations_cpu.mlir b/tensorflow/compiler/mlir/tf2xla/tests/tfxla_device_specific_transformations_cpu.mlir index c06ba74641ae09..67aa69d5806aaf 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/tfxla_device_specific_transformations_cpu.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/tfxla_device_specific_transformations_cpu.mlir @@ -4,9 +4,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-LABEL: stateless_op func.func @stateless_op() -> tensor { - // CHECK: %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %cst = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor %0 = "tf.StatelessRandomGetAlg"() {device = ""} : () -> tensor return %0 : tensor } -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/tf2xla/tests/tfxla_device_specific_transformations_gpu.mlir b/tensorflow/compiler/mlir/tf2xla/tests/tfxla_device_specific_transformations_gpu.mlir index f051960fb5a2c0..4d5da14a519b61 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/tfxla_device_specific_transformations_gpu.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/tfxla_device_specific_transformations_gpu.mlir @@ -4,9 +4,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-LABEL: stateless_op func.func @stateless_op() -> tensor { - // CHECK: %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %cst = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor %0 = "tf.StatelessRandomGetAlg"() {device = ""} : () -> tensor return %0 : tensor } -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD index b4f5d00b7a9e37..ed0429ad242c94 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD @@ -1,10 +1,10 @@ # Description: # TF2XLA Bridge transforms -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") -load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static") package( @@ -487,13 +487,12 @@ tf_cc_test( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/core/tpu:tpu_defs", "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@local_tsl//tsl/lib/core:status_test_util", - "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:status", "@local_tsl//tsl/platform:statusor", ], diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc 
index 24f7711539a5b4..ade2b5faa73c8a 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc @@ -20,21 +20,22 @@ limitations under the License. #include #include -#include #include #include "absl/status/status.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/register_common_dialects.h" -#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/core/tpu/tpu_defs.h" #include "tsl/lib/core/status_test_util.h" -#include "tsl/platform/errors.h" #include "tsl/platform/status.h" #include "tsl/platform/statusor.h" @@ -131,7 +132,7 @@ TEST_F(LegalizationOpConfigTest, CountLoweringsSet) { // a new op, we should expect these to change too. EXPECT_EQ(mlir_lowering_count, 67); EXPECT_EQ(tf2xla_fallback_count, 315); - EXPECT_EQ(non_categorized_count, 420); + EXPECT_EQ(non_categorized_count, 422); } // Just a counter test to see which ops have duplicate lowerings. This isn't a diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index 4a454c326550ae..f803230ea4f504 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mlprogram_util.h" #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h" +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" #include "tensorflow/compiler/mlir/tosa/tf_passes.h" #include "tensorflow/compiler/mlir/tosa/tf_tfl_passes.h" @@ -55,6 +56,7 @@ int main(int argc, char **argv) { mlir::mhlo::registerLegalizeTfPasses(); mlir::mhlo::registerTfXlaPasses(); mlir::quant::stablehlo::registerBridgePasses(); + tensorflow::tf2xla::internal::registerTFXLABridgePasses(); mlir::tosa::registerLegalizeTosaPasses(); mlir::tosa::registerTFtoTOSALegalizationPipeline(); mlir::tosa::registerTFLtoTOSALegalizationPipeline(); diff --git a/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD b/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD index ee79b37a73d1cc..135bc20b970bc5 100644 --- a/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD +++ b/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD @@ -81,6 +81,7 @@ distribute_py_strict_test( "notap", # The test is too long to run as part of llvm presubmits (b/173661843). "notpu", # Takes too long (b/192305423) "notsan", # Not needed, and there were issues with timeouts. 
+ "requires-net:external", ], # TODO(b/175056184): Re-enable xla_enable_strict_auto_jit once the issues diff --git a/tensorflow/compiler/mlir/tfr/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tfr/tests/canonicalize.mlir index 77508b60046151..912385f36b6f85 100644 --- a/tensorflow/compiler/mlir/tfr/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tfr/tests/canonicalize.mlir @@ -41,7 +41,7 @@ func.func @constant_tensor_array() -> !tfr.tensor { %1 = "tfr.constant_tensor"(%0) : (!tfr.attr) -> !tfr.tensor func.return %1 : !tfr.tensor -// CHECK-NEXT: %[[RES:.*]] = "tf.Const"() {value = dense<[1, -1, 3]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK-NEXT: %[[RES:.*]] = "tf.Const"() <{value = dense<[1, -1, 3]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK-NEXT: "tfr.cast"(%[[RES]]) : (tensor<3xi64>) -> !tfr.tensor // CHECK-NEXT: return } @@ -54,7 +54,7 @@ func.func @constant_tensor_scalar() -> !tfr.tensor { %1 = "tfr.constant_tensor"(%0) : (i32) -> !tfr.tensor func.return %1 : !tfr.tensor -// CHECK-NEXT: %[[RES:.*]] = "tf.Const"() {value = dense<42> : tensor} : () -> tensor +// CHECK-NEXT: %[[RES:.*]] = "tf.Const"() <{value = dense<42> : tensor}> : () -> tensor // CHECK-NEXT: "tfr.cast"(%[[RES]]) : (tensor) -> !tfr.tensor // CHECK-NEXT: return } @@ -83,7 +83,7 @@ func.func @quant_raw_data(%arg0: tensor<1x10x!quant.uniform>) -> // CHECK-LABEL: quant_raw_data_with_list func.func @quant_raw_data_with_list(%arg0: !tfr.tensor, %arg1: !tfr.tensor) -> !tfr.tensor { - %cst_1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %cst_1 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor %1 = "tfr.cast"(%arg0) : (!tfr.tensor) -> tensor<1x4x4x3x!quant.uniform> %2 = "tfr.cast"(%arg1) : (!tfr.tensor) -> tensor<1x3x4x3x!quant.uniform> %3 = "tfr.cast"(%2) : (tensor<1x3x4x3x!quant.uniform>) -> !tfr.tensor @@ -94,7 +94,7 @@ func.func @quant_raw_data_with_list(%arg0: !tfr.tensor, %arg1: !tfr.tensor) -> ! 
%8 = tfr.call @tf__concat(%7, %6) : (!tfr.tensor, !tfr.tensor_list) -> !tfr.tensor func.return %8 : !tfr.tensor -// CHECK: %[[CONST_0:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK: %[[BUILD_LIST_0:.*]] = "tfr.build_list"(%arg1, %arg0) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list // CHECK: %[[CAST_0:.*]] = "tfr.cast"(%[[CONST_0]]) : (tensor) -> !tfr.tensor // CHECK: %[[CONCAT_O:.*]] = tfr.call @tf__concat(%[[CAST_0]], %[[BUILD_LIST_0]]) : (!tfr.tensor, !tfr.tensor_list) -> !tfr.tensor @@ -131,8 +131,8 @@ func.func @quant_qparam(%arg0: tensor<1x10x!quant.uniform>) -> ( %2 = "tfr.cast"(%zp) : (!tfr.tensor) -> tensor func.return %1, %2 : tensor, tensor -// CHECK-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<1.000000e-01> : tensor} -// CHECK-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<42> : tensor} : () -> tensor +// CHECK-DAG: %[[scale:.*]] = "tf.Const"() <{value = dense<1.000000e-01> : tensor}> +// CHECK-DAG: %[[zp:.*]] = "tf.Const"() <{value = dense<42> : tensor}> : () -> tensor // CHECK: return %[[scale]], %[[zp]] } @@ -144,8 +144,8 @@ func.func @quant_qparam_per_channel(%arg0: tensor<1x3x!quant.uniform tensor<3xi32> func.return %1, %2 : tensor<3xf32>, tensor<3xi32> -// CHECK-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<[1.000000e-01, 2.000000e-01, 3.000000e-01]> : tensor<3xf32>} -// CHECK-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<[1, 2, 3]> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK-DAG: %[[scale:.*]] = "tf.Const"() <{value = dense<[1.000000e-01, 2.000000e-01, 3.000000e-01]> : tensor<3xf32>}> +// CHECK-DAG: %[[zp:.*]] = "tf.Const"() <{value = dense<[1, 2, 3]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: return %[[scale]], %[[zp]] } @@ -168,9 +168,9 @@ func.func @redundant_cast_with_different_element_type(%arg0: tensor<*xf32>) -> ( %2 = "tfr.cast"(%0) : (!tfr.tensor) -> tensor<2xi32> func.return %1, %2 : tensor<*xi32>, tensor<2xi32> -// CHECK: %[[tf_cast_unranked:.*]] = "tf.Cast"(%arg0) {Truncate = false} : (tensor<*xf32>) -> tensor<*xi32> -// CHECK: %[[ensure_shape:.*]] = "tf.EnsureShape"(%arg0) {shape = #tf_type.shape<2>} : (tensor<*xf32>) -> tensor<2xf32> -// CHECK: %[[tf_cast_ranked:.*]] = "tf.Cast"(%[[ensure_shape]]) {Truncate = false} : (tensor<2xf32>) -> tensor<2xi32> +// CHECK: %[[tf_cast_unranked:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<*xf32>) -> tensor<*xi32> +// CHECK: %[[ensure_shape:.*]] = "tf.EnsureShape"(%arg0) <{shape = #tf_type.shape<2>}> : (tensor<*xf32>) -> tensor<2xf32> +// CHECK: %[[tf_cast_ranked:.*]] = "tf.Cast"(%[[ensure_shape]]) <{Truncate = false}> : (tensor<2xf32>) -> tensor<2xi32> // CHECK: return %[[tf_cast_unranked]], %[[tf_cast_ranked]] : tensor<*xi32>, tensor<2xi32> } @@ -185,7 +185,7 @@ func.func @redundant_cast_with_quant_type(%arg0: tensor<10x!quant.uniform // CHECK: %[[CAST_0:.*]] = "tfr.cast"(%arg0) : (tensor<10x!quant.uniform>) -> !tfr.tensor // CHECK: %[[CAST_1:.*]] = "tfr.cast"(%[[CAST_0]]) : (!tfr.tensor) -> tensor<10xi8> -// CHECK: %[[CAST_2:.*]] = "tf.Cast"(%[[CAST_1]]) {Truncate = false} : (tensor<10xi8>) -> tensor<10xi32> +// CHECK: %[[CAST_2:.*]] = "tf.Cast"(%[[CAST_1]]) <{Truncate = false}> : (tensor<10xi8>) -> tensor<10xi32> // CHECK: return %[[CAST_2]] : tensor<10xi32> } diff --git a/tensorflow/compiler/mlir/tfr/tests/decompose.mlir b/tensorflow/compiler/mlir/tfr/tests/decompose.mlir index 0dcd363164f760..eb35c3a30e79e6 100644 --- a/tensorflow/compiler/mlir/tfr/tests/decompose.mlir +++ 
b/tensorflow/compiler/mlir/tfr/tests/decompose.mlir @@ -227,8 +227,8 @@ func.func @decompose_quant_scale_factor() -> (!tfr.tensor, !tfr.tensor) { %list2 = "tfr.build_list"(%input_scale_tensor, %perchannel_scale_tensor) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list %perchannel = "tfr.quant_scale_factor"(%output_scale, %list2) : (f32, !tfr.tensor_list) -> !tfr.tensor func.return %out, %perchannel : !tfr.tensor, !tfr.tensor -// CHECK-DAG: %[[scale_factors:.*]] = "tf.Const"() {value = dense<[1.000000e+00, 1.000000e+01]> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: %[[scale_factor:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor +// CHECK-DAG: %[[scale_factors:.*]] = "tf.Const"() <{value = dense<[1.000000e+00, 1.000000e+01]> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[scale_factor:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK: %[[cast:.*]] = "tfr.cast"(%[[scale_factor]]) : (tensor) -> !tfr.tensor // CHECK: %[[cast_perchannel:.*]] = "tfr.cast"(%[[scale_factors]]) : (tensor<2xf32>) -> !tfr.tensor // CHECK: return %[[cast]], %[[cast_perchannel]] : !tfr.tensor, !tfr.tensor @@ -245,8 +245,8 @@ func.func @decompose_quant_scale_factor_invalid() -> !tfr.tensor { %out = "tfr.quant_scale_factor"(%output_scale, %list) : (f32, !tfr.tensor_list) -> !tfr.tensor func.return %out : !tfr.tensor // CHECK-DAG: %[[cst_0:.*]] = arith.constant 1.000000e-01 : f32 -// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() {value = dense<2.500000e-01> : tensor} : () -> tensor -// CHECK-DAG: %[[cst_2:.*]] = "tf.Const"() {value = dense<4.000000e-01> : tensor} : () -> tensor +// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<2.500000e-01> : tensor}> : () -> tensor +// CHECK-DAG: %[[cst_2:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor}> : () -> tensor // CHECK: %[[tfrcast0:.*]] = "tfr.cast"(%[[cst_1]]) : (tensor) -> !tfr.tensor // CHECK: %[[tfrcast1:.*]] = "tfr.cast"(%[[cst_2]]) : (tensor) -> !tfr.tensor // CHECK: %[[list:.*]] = "tfr.build_list"(%[[tfrcast0]], %[[tfrcast1]], %[[tfrcast0]]) : (!tfr.tensor, !tfr.tensor, !tfr.tensor) -> !tfr.tensor_list @@ -265,9 +265,9 @@ func.func @decompose_quant_rescale(%arg0: tensor<2xi32>) -> !tfr.tensor { // CHECK-DAG: %[[f32:.*]] = tfr.constant f32 -> !tfr.attr // CHECK-DAG: %[[i32:.*]] = tfr.constant i32 -> !tfr.attr -// CHECK-DAG: %[[scale_cst:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor +// CHECK-DAG: %[[scale_cst:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor}> : () -> tensor // CHECK-DAG: %false = arith.constant false -// CHECK-DAG: %[[zp_cst:.*]] = "tf.Const"() {value = dense<67> : tensor} : () -> tensor +// CHECK-DAG: %[[zp_cst:.*]] = "tf.Const"() <{value = dense<67> : tensor}> : () -> tensor // CHECK: %[[zp:.*]] = "tfr.cast"(%[[zp_cst]]) : (tensor) -> !tfr.tensor // CHECK: %[[scale:.*]] = "tfr.cast"(%[[scale_cst]]) : (tensor) -> !tfr.tensor // CHECK: %[[input:.*]] = "tfr.cast"(%arg0) : (tensor<2xi32>) -> !tfr.tensor diff --git a/tensorflow/compiler/mlir/tfr/tests/end2end.mlir b/tensorflow/compiler/mlir/tfr/tests/end2end.mlir index 6a49a0dbbb1d13..0654b216e0f165 100644 --- a/tensorflow/compiler/mlir/tfr/tests/end2end.mlir +++ b/tensorflow/compiler/mlir/tfr/tests/end2end.mlir @@ -17,7 +17,7 @@ func.func @my_rsqrt(%arg0: tensor<2x3xf32>) -> tensor<3x2x3xf32> { // CHECK-NEXT: %[[RE:.*]] = "tf.RiscReciprocal"(%arg0) : (tensor<2x3xf32>) -> tensor<*xf32> // CHECK-NEXT: %[[SQRT:.*]] = "tf.RiscSqrt"(%[[RE]]) : (tensor<*xf32>) -> tensor<*xf32> -// 
CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[SQRT]]) {shape = #tf_type.shape<3x2x3>} : (tensor<*xf32>) -> tensor<3x2x3xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[SQRT]]) <{shape = #tf_type.shape<3x2x3>}> : (tensor<*xf32>) -> tensor<3x2x3xf32> // CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> } @@ -26,11 +26,11 @@ func.func @my_leaky_relu(%arg0: tensor<2x3xf32>) -> tensor<3x2x3xf32> { %0 = "tf.MyLeakyRelu"(%arg0) {alpha=3.0 : f32} : (tensor<2x3xf32>) -> tensor<3x2x3xf32> func.return %0 : tensor<3x2x3xf32> -// CHECK-NEXT: %[[ALPHA:.*]] = "tf.Const"() {value = dense<3.000000e+00> : tensor} : () -> tensor +// CHECK-NEXT: %[[ALPHA:.*]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor}> : () -> tensor // CHECK-NEXT: %[[SHAPE:.*]] = "tf.RiscShape"(%arg0) {T = i32} : (tensor<2x3xf32>) -> tensor<*xi32> // CHECK-NEXT: %[[ALPHA1:.*]] = "tf.RiscBroadcast"(%[[ALPHA]], %[[SHAPE]]) : (tensor, tensor<*xi32>) -> tensor<*xf32> // CHECK-NEXT: %[[MAX:.*]] = "tf.RiscMaximum"(%arg0, %[[ALPHA1]]) : (tensor<2x3xf32>, tensor<*xf32>) -> tensor<*xf32> -// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[MAX]]) {shape = #tf_type.shape<3x2x3>} : (tensor<*xf32>) -> tensor<3x2x3xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[MAX]]) <{shape = #tf_type.shape<3x2x3>}> : (tensor<*xf32>) -> tensor<3x2x3xf32> // CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> } @@ -39,11 +39,11 @@ func.func @my_leaky_relu_with_default(%arg0: tensor<2x3xf32>) -> tensor<3x2x3xf3 %0 = "tf.MyLeakyRelu"(%arg0) : (tensor<2x3xf32>) -> tensor<3x2x3xf32> func.return %0 : tensor<3x2x3xf32> -// CHECK-NEXT: %[[ALPHA:.*]] = "tf.Const"() {value = dense<2.000000e-01> : tensor} : () -> tensor +// CHECK-NEXT: %[[ALPHA:.*]] = "tf.Const"() <{value = dense<2.000000e-01> : tensor}> : () -> tensor // CHECK-NEXT: %[[SHAPE:.*]] = "tf.RiscShape"(%arg0) {T = i32} : (tensor<2x3xf32>) -> tensor<*xi32> // CHECK-NEXT: %[[ALPHA1:.*]] = "tf.RiscBroadcast"(%[[ALPHA]], %[[SHAPE]]) : (tensor, tensor<*xi32>) -> tensor<*xf32> // CHECK-NEXT: %[[MAX:.*]] = "tf.RiscMaximum"(%arg0, %[[ALPHA1]]) : (tensor<2x3xf32>, tensor<*xf32>) -> tensor<*xf32> -// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[MAX]]) {shape = #tf_type.shape<3x2x3>} : (tensor<*xf32>) -> tensor<3x2x3xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[MAX]]) <{shape = #tf_type.shape<3x2x3>}> : (tensor<*xf32>) -> tensor<3x2x3xf32> // CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> } @@ -53,7 +53,7 @@ func.func @my_cast(%arg0: tensor<2x3xf32>) -> tensor<2x3xi32> { func.return %0 : tensor<2x3xi32> // CHECK-NEXT: %[[CAST:.*]] = "tf.RiscCast"(%arg0) {Tout = i32} : (tensor<2x3xf32>) -> tensor<*xi32> -// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[CAST]]) {shape = #tf_type.shape<2x3>} : (tensor<*xi32>) -> tensor<2x3xi32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[CAST]]) <{shape = #tf_type.shape<2x3>}> : (tensor<*xi32>) -> tensor<2x3xi32> // CHECK-NEXT: return %[[ES]] : tensor<2x3xi32> } @@ -62,9 +62,9 @@ func.func @my_pack_single_input(%arg0: tensor<2x3xf32>) -> tensor<3x2x3xf32> { %0 = "tf.MyPack"(%arg0) {N=1:i32, axis=0:i32} : (tensor<2x3xf32>) -> tensor<3x2x3xf32> func.return %0 : tensor<3x2x3xf32> -// CHECK-NEXT: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-NEXT: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-NEXT: %[[ED:.*]] = "tf.ExpandDims"(%arg0, %[[AXIS]]) : (tensor<2x3xf32>, tensor) -> tensor<*xf32> -// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[ED]]) {shape = #tf_type.shape<3x2x3>} : (tensor<*xf32>) -> 
tensor<3x2x3xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[ED]]) <{shape = #tf_type.shape<3x2x3>}> : (tensor<*xf32>) -> tensor<3x2x3xf32> // CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> } @@ -73,13 +73,13 @@ func.func @my_pack_multiple_inputs(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32 %0 = "tf.MyPack"(%arg0, %arg1, %arg2) {N=3:i32, axis=0:i32} : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<3x2x3xf32> func.return %0 : tensor<3x2x3xf32> -// CHECK-NEXT: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-NEXT: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK-NEXT: %[[ED0:.*]] = "tf.ExpandDims"(%arg0, %[[AXIS]]) : (tensor<2x3xf32>, tensor) -> tensor<*xf32> // CHECK-NEXT: %[[ED1:.*]] = "tf.ExpandDims"(%arg1, %[[AXIS]]) : (tensor<2x3xf32>, tensor) -> tensor<*xf32> // CHECK-NEXT: %[[CC0:.*]] = "tf.RiscConcat"(%[[ED0]], %[[ED1]]) {axis = 0 : i32} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> // CHECK-NEXT: %[[ED2:.*]] = "tf.ExpandDims"(%arg2, %[[AXIS]]) : (tensor<2x3xf32>, tensor) -> tensor<*xf32> // CHECK-NEXT: %[[CC1:.*]] = "tf.RiscConcat"(%[[CC0]], %[[ED2]]) {axis = 0 : i32} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> -// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[CC1]]) {shape = #tf_type.shape<3x2x3>} : (tensor<*xf32>) -> tensor<3x2x3xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[CC1]]) <{shape = #tf_type.shape<3x2x3>}> : (tensor<*xf32>) -> tensor<3x2x3xf32> // CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> } @@ -98,7 +98,7 @@ func.func @my_add_n_multiple_inputs(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf3 // CHECK-NEXT: %[[ADD0:.*]] = "tf.RiscAdd"(%arg0, %arg1) : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<*xf32> // CHECK-NEXT: %[[ADD1:.*]] = "tf.RiscAdd"(%[[ADD0]], %arg2) : (tensor<*xf32>, tensor<2x3xf32>) -> tensor<*xf32> -// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[ADD1]]) {shape = #tf_type.shape<2x3>} : (tensor<*xf32>) -> tensor<2x3xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[ADD1]]) <{shape = #tf_type.shape<2x3>}> : (tensor<*xf32>) -> tensor<2x3xf32> // CHECK-NEXT: return %[[ES]] : tensor<2x3xf32> } @@ -112,10 +112,10 @@ func.func @my_map_and_batch_dataset(%input: tensor<*x!tf_type.variant>, : (tensor<*x!tf_type.variant>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.variant> func.return %0 : tensor<*x!tf_type.variant> -// CHECK-DAG: %[[BATCH:.*]] = "tf.Const"() {value = dense<1000> : tensor} : () -> tensor -// CHECK-DAG: %[[PARAL:.*]] = "tf.Const"() {value = dense<8> : tensor} : () -> tensor -// CHECK-DAG: %[[KEEP:.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor -// CHECK: %[[CAST:.*]] = "tf.Cast"(%arg2) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32> +// CHECK-DAG: %[[BATCH:.*]] = "tf.Const"() <{value = dense<1000> : tensor}> : () -> tensor +// CHECK-DAG: %[[PARAL:.*]] = "tf.Const"() <{value = dense<8> : tensor}> : () -> tensor +// CHECK-DAG: %[[KEEP:.*]] = "tf.Const"() <{value = dense : tensor}> : () -> tensor +// CHECK: %[[CAST:.*]] = "tf.Cast"(%arg2) <{Truncate = false}> : (tensor<*xi32>) -> tensor<*xf32> // CHECK: %[[RET:.*]] = "tf.MapAndBatchDatasetV0"(%arg0, %[[BATCH]], %[[PARAL]], %[[KEEP]], %arg1, %[[CAST]]) // CHECK-SAME: {f = @__some_func, output_shapes = [#tf_type.shape<>], output_types = [f32], preserve_cardinality = true} : (tensor<*x!tf_type.variant>, tensor, tensor, tensor, tensor<*xf32>, tensor<*xf32>) -> tensor<*x!tf_type.variant> // CHECK: return %[[RET]] : tensor<*x!tf_type.variant> diff --git 
a/tensorflow/compiler/mlir/tfr/tests/raise_to_tf.mlir b/tensorflow/compiler/mlir/tfr/tests/raise_to_tf.mlir index 4be59b531365f2..14d21277dce2f1 100644 --- a/tensorflow/compiler/mlir/tfr/tests/raise_to_tf.mlir +++ b/tensorflow/compiler/mlir/tfr/tests/raise_to_tf.mlir @@ -17,7 +17,7 @@ func.func @decompose_tf_same(%arg0: tensor<1x2x3x4x!tf_type.string>) -> tensor<1 func.return %2 : tensor<1x2x3x4x!tf_type.string> // CHECK: %[[id:.*]] = "tf.RiscSame"(%arg0) : (tensor<1x2x3x4x!tf_type.string>) -> tensor<*x!tf_type.string> -// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[id]]) {shape = #tf_type.shape<1x2x3x4>} : (tensor<*x!tf_type.string>) -> tensor<1x2x3x4x!tf_type.string> +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[id]]) <{shape = #tf_type.shape<1x2x3x4>}> : (tensor<*x!tf_type.string>) -> tensor<1x2x3x4x!tf_type.string> // CHECK: return %[[es]] : tensor<1x2x3x4x!tf_type.string> } @@ -32,7 +32,7 @@ func.func @decompose_tf_consecutive(%arg0: tensor<1x2x3x4x!tf_type.string>, %arg // CHECK: %[[id0:.*]] = "tf.RiscSame"(%arg0) : (tensor<1x2x3x4x!tf_type.string>) -> tensor<*x!tf_type.string> // CHECK: %[[id2:.*]] = "tf.RiscSame"(%arg2) : (tensor) -> tensor<*xf32> -// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[id2]]) {shape = #tf_type.shape<>} : (tensor<*xf32>) -> tensor +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[id2]]) <{shape = #tf_type.shape<>}> : (tensor<*xf32>) -> tensor // CHECK: return %[[es]] : tensor } @@ -47,7 +47,7 @@ func.func @decompose_tf_concat_n(%arg0: tensor, %arg1: tensor, %arg2: func.return %4 : tensor<3xf32> // CHECK: %[[concat:.*]] = "tf.RiscConcat"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor<*xf32> -// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[concat]]) {shape = #tf_type.shape<3>} : (tensor<*xf32>) -> tensor<3xf32> +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[concat]]) <{shape = #tf_type.shape<3>}> : (tensor<*xf32>) -> tensor<3xf32> // CHECK: return %[[es]] : tensor<3xf32> } @@ -62,7 +62,7 @@ func.func @decompose_tf_split(%arg0: tensor<3xf32>) -> (tensor) { func.return %4 : tensor // CHECK: %[[split:.*]]:3 = "tf.RiscSplit"(%arg0) {N = 3 : i32} : (tensor<3xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[split]]#0) {shape = #tf_type.shape<>} : (tensor<*xf32>) -> tensor +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[split]]#0) <{shape = #tf_type.shape<>}> : (tensor<*xf32>) -> tensor // CHECK: return %[[es]] : tensor } @@ -75,7 +75,7 @@ func.func @decompose_tf_cast(%arg0: tensor) -> tensor { func.return %4 : tensor // CHECK: %[[tfcast:.*]] = "tf.RiscCast"(%arg0) {K = i32} : (tensor) -> tensor<*xi32> -// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[tfcast]]) {shape = #tf_type.shape<>} : (tensor<*xi32>) -> tensor +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[tfcast]]) <{shape = #tf_type.shape<>}> : (tensor<*xi32>) -> tensor // CHECK: return %[[es]] : tensor } @@ -87,7 +87,7 @@ func.func @convert_to_scalar_tensor() -> tensor { %4 = "tfr.cast"(%cst) : (!tfr.tensor) -> tensor func.return %4 : tensor -// CHECK: %[[cst:.*]] = "tf.Const"() {value = dense<3.000000e+00> : tensor} : () -> tensor +// CHECK: %[[cst:.*]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor}> : () -> tensor // CHECK: return %[[cst]] : tensor } @@ -100,7 +100,7 @@ func.func @attribute_propagate(%arg0: tensor) -> tensor { func.return %4 : tensor // CHECK: %[[tfcast:.*]] = "tf.RiscCast"(%arg0) {K = i32, _tpu_replicate, device = "hello"} : (tensor) -> tensor<*xi32> -// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[tfcast]]) {shape = #tf_type.shape<>} : (tensor<*xi32>) -> tensor 
+// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[tfcast]]) <{shape = #tf_type.shape<>}> : (tensor<*xi32>) -> tensor // CHECK: return %[[es]] : tensor } @@ -111,7 +111,7 @@ func.func @fixed_element_attribute(%arg0: tensor<2xf32>) -> tensor<2xi1> { %2 = "tfr.cast"(%1) : (!tfr.tensor) -> tensor<2xi1> func.return %2 : tensor<2xi1> // CHECK: %[[positive:.*]] = "tf.Positive"(%arg0) : (tensor<2xf32>) -> tensor<*xi1> -// CHECK: %[[res:.*]] = "tf.EnsureShape"(%[[positive]]) {shape = #tf_type.shape<2>} : (tensor<*xi1>) -> tensor<2xi1> +// CHECK: %[[res:.*]] = "tf.EnsureShape"(%[[positive]]) <{shape = #tf_type.shape<2>}> : (tensor<*xi1>) -> tensor<2xi1> // CHECK: return %[[res]] : tensor<2xi1> } diff --git a/tensorflow/compiler/mlir/tfr/tests/rewrite_quantized_io.mlir b/tensorflow/compiler/mlir/tfr/tests/rewrite_quantized_io.mlir index af2f843144ce5f..05823bfad27005 100644 --- a/tensorflow/compiler/mlir/tfr/tests/rewrite_quantized_io.mlir +++ b/tensorflow/compiler/mlir/tfr/tests/rewrite_quantized_io.mlir @@ -22,8 +22,8 @@ func.func @remove_quantized_io( %1 = "tf.Intermediate"(%arg1) : (tensor<1x5xf32>) -> tensor<1x5xf32> func.return %0, %1 : tensor<1x10x!quant.uniform>, tensor<1x5xf32> -// CHECK-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<1.000000e-01> : tensor} : () -> tensor -// CHECK-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor +// CHECK-DAG: %[[scale:.*]] = "tf.Const"() <{value = dense<1.000000e-01> : tensor}> : () -> tensor +// CHECK-DAG: %[[zp:.*]] = "tf.Const"() <{value = dense<-128> : tensor}> : () -> tensor // CHECK: %[[quant:.*]] = "tfr.cast"(%arg0) : (tensor<1x10xi8>) -> !tfr.tensor // CHECK: %[[scale_cast:.*]] = "tfr.cast"(%[[scale]]) // CHECK: %[[zp_cast:.*]] = "tfr.cast"(%[[zp]]) diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index 5a3626061d6050..50b19af44ecf52 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -368,6 +368,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops", "//tensorflow/compiler/mlir/tf2xla/api/v2:cluster_tf", "//tensorflow/compiler/mlir/tf2xla/api/v2:tf_dialect_to_executor", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/common_runtime:function_body", diff --git a/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir index 823cbbc6f24cf7..75eff829e60952 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir @@ -11,7 +11,7 @@ func.func private @test_fuse_tpu_ops(%arg0: tensor<*xi32>, %arg1: tensor<*x!tf_t // CHECK-NOT: tf.TPUExecuteOp // CHECK-NEXT: %0 = "tf.ReadVariableOp"(%arg1) - // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array, operands_with_static_shape = [], producer_name = "default"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>) + // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %0) <{metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array, operands_with_static_shape = [], producer_name = "default"}> : (tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>) // CHECK-NEXT: return 
[[exec_result]] : tensor<*xi32> %0 = "tf.ReadVariableOp"(%arg1) {device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<*xi32> @@ -38,8 +38,8 @@ func.func private @test_outside_compilation(%arg0: tensor<*xi32>, %arg1: tensor< // CHECK-NOT: tf.TPUExecuteOp // CHECK-NEXT: %0 = "tf.ReadVariableOp"(%arg1) - // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %0) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array, operands_with_static_shape = [], producer_name = "default"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>) - // CHECK-NEXT: "tf._XlaSendFromHost"(%arg0, %0, [[key]]) {_xla_has_host_transfer = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", device_ordinal = 0 : i64, key = "host_compute_channel_0_retvals"} : (tensor<*xi32>, tensor<*xi32>, tensor<3x!tf_type.string>) -> () + // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %0) <{metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array, operands_with_static_shape = [], producer_name = "default"}> : (tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>) + // CHECK-NEXT: "tf._XlaSendFromHost"(%arg0, %0, [[key]]) <{device_ordinal = 0 : i64, key = "host_compute_channel_0_retvals"}> {_xla_has_host_transfer = true, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*xi32>, tensor<*xi32>, tensor<3x!tf_type.string>) -> () // CHECK-NEXT: return [[exec_result]] : tensor<*xi32> %0 = "tf.ReadVariableOp"(%arg1) {device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<*xi32> %1 = "tf.Shape"(%arg0) {device = "/CPU:0"} : (tensor<*xi32>) -> tensor @@ -69,8 +69,8 @@ func.func private @test_fuse_dynamic_dimension_ops(%arg0: tensor, %arg1 // CHECK: [[read_result:%.*]] = "tf.ReadVariableOp"(%arg1) // CHECK: [[shape_result_1:%.*]] = "tf.Shape"(%arg0) {device = "/CPU:0"} : (tensor) -> tensor // CHECK: [[shape_result_2:%.*]] = "tf.Shape"([[read_result]]) {device = "/CPU:0"} : (tensor<*xi32>) -> tensor - // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, [[shape_result_2]], %0, %0, %arg2, %arg4, %arg3) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array, operands_with_static_shape = [0 : i32, 1 : i32, 3 : i32], producer_name = "default"} : (tensor, tensor, tensor<*xi32>, tensor<*xi32>, tensor<2xi64>, tensor, tensor) -> (tensor<3x!tf_type.string>, tensor<*xi32>) - // CHECK: [[key_1:%.*]], [[exec_result_1:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %2, %0, %1) {metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array, operands_with_static_shape = [], producer_name = "default"} : (tensor, tensor, tensor<*xi32>, tensor) -> (tensor<3x!tf_type.string>, tensor<*xi32>) + // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, [[shape_result_2]], %0, %0, %arg2, %arg4, %arg3) <{metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array, operands_with_static_shape = [0 : i32, 1 : i32, 3 : i32], producer_name = "default"}> : (tensor, tensor, tensor<*xi32>, tensor<*xi32>, tensor<2xi64>, tensor, tensor) -> (tensor<3x!tf_type.string>, tensor<*xi32>) + // CHECK: [[key_1:%.*]], [[exec_result_1:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %2, %0, %1) <{metadata = "metadata", mlir_module = "mlir_module", operandSegmentSizes = array, operands_with_static_shape = [], producer_name = "default"}> : (tensor, tensor, tensor<*xi32>, tensor) -> 
(tensor<3x!tf_type.string>, tensor<*xi32>) // CHECK-NEXT: return [[exec_result]] : tensor<*xi32> %0 = "tf.ReadVariableOp"(%arg1) {device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<*xi32> %dyn_arg0 = "tf.SetStaticDimensionBounds" (%arg0, %arg2) :(tensor, tensor<2xi64>) -> tensor diff --git a/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir index 28c08a1a369f0e..e6d5aec8285a0b 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir @@ -5,16 +5,16 @@ module attributes {tf_saved_model.semantics} { // Test hoisting varhandle op. // CHECK-LABEL: func @_tfrt_resource_init -// CHECK: [[handle:%.*]] = "tf.VarHandleOp"() {container = "", shared_name = "x"} : () -> tensor>> +// CHECK: [[handle:%.*]] = "tf.VarHandleOp"() <{container = "", shared_name = "x"}> : () -> tensor>> // CHECK: [[x:%.*]] = "tf.ReadVariableOp"([[handle]]) {device = "/CPU:0", dtype = i32} : (tensor>>) -> tensor -// CHECK: "tf._TfrtSetResource"([[x]]) {device = "/CPU:0", index = 0 : i64} : (tensor) -> () +// CHECK: "tf._TfrtSetResource"([[x]]) <{index = 0 : i64}> {device = "/CPU:0"} : (tensor) -> () // CHECK-LABEL: func @test_hoist_varhandleop func.func @hoist_varhandleop(%arg: tensor {tf_saved_model.index_path = ["input"]}) -> (tensor {tf_saved_model.index_path = ["r"]}) attributes {tf_saved_model.exported_names = ["test_hoist_varhandleop"]} { // CHECK-NOT: tf.VarHandleOp // CHECK-NOT: tf.ReadVariableOp - // CHECK: [[v:%.*]] = "tf._TfrtGetResource"() {container = [""], device = "/CPU:0", indices = [0], shared_name = [""]} : () -> tensor + // CHECK: [[v:%.*]] = "tf._TfrtGetResource"() <{container = [""], indices = [0], shared_name = [""]}> {device = "/CPU:0"} : () -> tensor // CHECK: [[r:%.*]] = "tf.AddV2"({{.*}}, [[v]]) {device = "/CPU:0"} : (tensor, tensor) -> tensor // CHECK: return [[r]] %handle = "tf.VarHandleOp"() {container = "", shared_name = "x"} : () -> tensor>> @@ -34,16 +34,16 @@ module attributes {tf_saved_model.semantics} { // CHECK-LABEL: func @_tfrt_resource_init // CHECK: [[handle:%.*]] = "tf.HashTableV2"() // CHECK-SAME: shared_name = "x" -// CHECK: "tf._TfrtSetResource"([[handle]]) {device = "/job:localhost/replica:0/task:0/device:CPU:0", index = [[handle_idx:.*]] : i64} +// CHECK: "tf._TfrtSetResource"([[handle]]) <{index = [[handle_idx:.*]] : i64}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} // CHECK: [[x:%.*]] = "tf.LookupTableSizeV2"([[handle]]) -// CHECK: "tf._TfrtSetResource"([[x]]) {device = "/job:localhost/replica:0/task:0/device:CPU:0", index = [[size_idx:.*]] : i64} : (tensor) -> () +// CHECK: "tf._TfrtSetResource"([[x]]) <{index = [[size_idx:.*]] : i64}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> () // CHECK: func @test_hoist_hash_table func.func @hoist_hash_table(%arg: tensor {tf_saved_model.index_path = ["input"]}, %default: tensor {tf_saved_model.index_path = ["default"]}) -> (tensor {tf_saved_model.index_path = ["r"]}, tensor<*xi64> {tf_saved_model.index_path = ["r1"]}) attributes {tf_saved_model.exported_names = ["test_hoist_hash_table"]} { // CHECK-NOT: tf.HashTableV2 // CHECK-NOT: tf.LookupTableSizeV2 - // CHECK: [[v:%.*]]:2 = "tf._TfrtGetResource"() {container = ["", ""], device = "/job:localhost/replica:0/task:0/device:CPU:0", indices = [0, 1], shared_name = [{{.*}}, {{.*}}]} + // CHECK: [[v:%.*]]:2 = "tf._TfrtGetResource"() <{container = ["", ""], indices = [0, 1], shared_name = 
[{{.*}}, {{.*}}]}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} // CHECK: [[r:%.*]] = "tf.LookupTableFindV2"([[v]]#[[handle_idx]] // CHECK: return [[v]]#[[size_idx]], [[r]] %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "x", use_node_name_sharing = false, value_dtype = i64} : () -> tensor @@ -61,17 +61,17 @@ module attributes {tf_saved_model.semantics} { // Test hoisting const op. // CHECK-LABEL: func @_tfrt_resource_init -// CHECK: [[const:%.*]] = "tf.Const"() {device = "/CPU:0", value = dense<0> : tensor} : () -> tensor +// CHECK: [[const:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> {device = "/CPU:0"} : () -> tensor // CHECK: [[x:%.*]] = "tf.AddV2"([[const]], [[const]]) {device = "/CPU:0"} : (tensor, tensor) -> tensor -// CHECK: "tf._TfrtSetResource"([[x]]) {device = "/CPU:0", index = 0 : i64} : (tensor) -> () -// CHECK: [[const_1:%.*]] = "tf.Const"() {device = "/CPU:0", value = dense<1> : tensor} : () -> tensor -// CHECK: "tf._TfrtSetResource"([[const_1]]) {device = "/CPU:0", index = 1 : i64} : (tensor) -> () +// CHECK: "tf._TfrtSetResource"([[x]]) <{index = 0 : i64}> {device = "/CPU:0"} : (tensor) -> () +// CHECK: [[const_1:%.*]] = "tf.Const"() <{value = dense<1> : tensor}> {device = "/CPU:0"} : () -> tensor +// CHECK: "tf._TfrtSetResource"([[const_1]]) <{index = 1 : i64}> {device = "/CPU:0"} : (tensor) -> () // CHECK-LABEL: func @test_hoist_const func.func @hoist_const(%arg: tensor {tf_saved_model.index_path = ["input"]}) -> (tensor {tf_saved_model.index_path = ["r"]}) attributes {tf_saved_model.exported_names = ["test_hoist_const"]} { // CHECK-NOT: tf.Const - // CHECK: [[v:%.*]] = "tf._TfrtGetResource"() {container = [""], device = "/CPU:0", indices = [0], shared_name = [""]} : () -> tensor + // CHECK: [[v:%.*]] = "tf._TfrtGetResource"() <{container = [""], indices = [0], shared_name = [""]}> {device = "/CPU:0"} : () -> tensor // CHECK-NEXT: "tf.AddV2"({{.*}}, [[v]]) {device = "/CPU:0"} : (tensor, tensor) -> tensor // CHECK-NEXT: return %const = "tf.Const"() {device = "/CPU:0", value = dense<0> : tensor} : () -> tensor @@ -84,7 +84,7 @@ func.func @hoist_const(%arg: tensor {tf_saved_model.index_path = ["input"]} func.func @hoist_const_return(%arg: tensor {tf_saved_model.index_path = ["input"]}) -> (tensor {tf_saved_model.index_path = ["r"]}) attributes {tf_saved_model.exported_names = ["test_hoist_const_return"]} { // CHECK-NOT: tf.Const - // CHECK: [[v:%.*]] = "tf._TfrtGetResource"() {container = [""], device = "/CPU:0", indices = [1], shared_name = [""]} : () -> tensor + // CHECK: [[v:%.*]] = "tf._TfrtGetResource"() <{container = [""], indices = [1], shared_name = [""]}> {device = "/CPU:0"} : () -> tensor // CHECK-NEXT: return [[v]] %const = "tf.Const"() {device = "/CPU:0", value = dense<1> : tensor} : () -> tensor func.return %const : tensor @@ -99,17 +99,17 @@ module attributes {tf_saved_model.semantics} { // Test hoisting write side-effect ops. 
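// The CHECK patterns in this file match the generic printed form of the hoisted
// ops, where inherent attributes are grouped in `<{...}>` ahead of the
// discardable `{...}` attribute dictionary. A minimal sketch of the shape being
// matched (types elided, names illustrative):
//   "tf._TfrtSetResource"(%v) <{index = 0 : i64}> {device = "/CPU:0"} : (...) -> ()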
// CHECK-LABEL: func @_tfrt_resource_init -// CHECK: [[const:%.*]] = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor -// CHECK: "tf._TfrtSetResource"([[const]]) {device = "/job:localhost/replica:0/task:0/device:CPU:0", index = [[const_idx:.*]] : i64} : (tensor) -> () -// CHECK: [[handle:%.*]] = "tf.VarHandleOp"() {container = "", shared_name = "x"} : () -> tensor>> -// CHECK: "tf._TfrtSetResource"([[handle]]) {device = "/job:localhost/replica:0/task:0/device:CPU:0", index = [[handle_idx:.*]] : i64} : (tensor>>) -> () +// CHECK: [[const:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> tensor +// CHECK: "tf._TfrtSetResource"([[const]]) <{index = [[const_idx:.*]] : i64}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> () +// CHECK: [[handle:%.*]] = "tf.VarHandleOp"() <{container = "", shared_name = "x"}> : () -> tensor>> +// CHECK: "tf._TfrtSetResource"([[handle]]) <{index = [[handle_idx:.*]] : i64}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor>>) -> () // CHECK: func @test_hoist_var_read_write func.func @hoist_var_read_write() -> (tensor {tf_saved_model.index_path = ["x"]}, tensor {tf_saved_model.index_path = ["r"]}) attributes {tf_saved_model.exported_names = ["test_hoist_var_read_write"]} { // CHECK-NOT: tf.Const // CHECK-NOT: tf.VarHandleOp - // CHECK: [[v:%.*]]:2 = "tf._TfrtGetResource"() {container = ["", ""], device = "/job:localhost/replica:0/task:0/device:CPU:0", indices = [0, 1], shared_name = [{{.*}}, {{.*}}]} : () -> ({{.*}}) + // CHECK: [[v:%.*]]:2 = "tf._TfrtGetResource"() <{container = ["", ""], indices = [0, 1], shared_name = [{{.*}}, {{.*}}]}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ({{.*}}) // CHECK: [[x:%.*]] = "tf.ReadVariableOp"([[v]]#[[handle_idx]]) {device = "/CPU:0", dtype = i32} : (tensor>>) -> tensor // CHECK-NEXT: "tf.AssignVariable"([[v]]#[[handle_idx]], [[v]]#[[const_idx]]) {device = "/CPU:0"} : (tensor>>, tensor) -> () // CHECK-NEXT: [[r:%.*]] = "tf.ReadVariableOp"([[v]]#[[handle_idx]]) {device = "/CPU:0", dtype = i32} : (tensor>>) -> tensor @@ -131,13 +131,13 @@ module attributes {tf_saved_model.semantics} { // Test not hoisting read variable op that used by control flow ops if var handle op and read variable op are separated, but still hoists const ops and var handle ops. 
// CHECK-LABEL: func @_tfrt_resource_init -// CHECK: [[handle:%.*]] = "tf.VarHandleOp"() {container = "", shared_name = "x"} : () -> tensor>> +// CHECK: [[handle:%.*]] = "tf.VarHandleOp"() <{container = "", shared_name = "x"}> : () -> tensor>> // CHECK: "tf._TfrtSetResource"([[handle]]) // CHECK-SAME: index = [[handle_index:.*]] -// CHECK: [[handle1:%.*]] = "tf.VarHandleOp"() {container = "", shared_name = "x"} : () -> tensor>> +// CHECK: [[handle1:%.*]] = "tf.VarHandleOp"() <{container = "", shared_name = "x"}> : () -> tensor>> // CHECK: "tf._TfrtSetResource"([[handle1]]) // CHECK-SAME: index = [[handle1_index:.*]] -// CHECK: [[const:%.*]] = "tf.Const"() {device = "/CPU:0", value = dense : tensor} : () -> tensor +// CHECK: [[const:%.*]] = "tf.Const"() <{value = dense : tensor}> {device = "/CPU:0"} : () -> tensor // CHECK: "tf._TfrtSetResource"([[const]]) // CHECK-SAME: index = [[const_index:.*]] func.func private @some_func( @@ -164,7 +164,7 @@ func.func @not_hoist_if(%arg: tensor {tf_saved_model.index_path = ["input"] attributes {tf_saved_model.exported_names = ["test_not_hoist_if"]} { %handle = "tf.VarHandleOp"() {container = "", shared_name = "x"} : () -> tensor>> // CHECK-NOT: tf.Const - // CHECK: "tf._TfrtGetResource"() + // CHECK: "tf._TfrtGetResource"() %cond = "tf.Const"() {device = "/CPU:0", value = dense : tensor} : () -> tensor // CHECK: tf.If %x = "tf.If"(%cond, %handle) {then_branch = @some_func, else_branch = @some_func, is_stateless = false} : (tensor, tensor>>) -> tensor @@ -185,7 +185,7 @@ func.func private @batched_function(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._input_shapes = [#tf_type.shape<1x3>, #tf_type.shape<*>], tf.signature.is_stateful} { // CHECK-NOT: tf.VarHandleOp // CHECK-NOT: tf.ReadVariableOp - // CHECK: "tf._TfrtGetResource"() + // CHECK: "tf._TfrtGetResource"() %0 = "tf.VarHandleOp"() {device = "/device:CPU:0", container = "", shared_name = "variable"} : () -> tensor>> %1 = "tf.ReadVariableOp"(%0) {device = "/device:CPU:0"} : (tensor>>) -> tensor<1x3xf32> %2 = "tf.AddV2"(%arg0, %1) {device = "/device:CPU:0"} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> @@ -197,7 +197,7 @@ func.func private @batched_function(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> func.func @main(%arg0: tensor<1x3xf32> {tf_saved_model.index_path = ["input"]}) -> (tensor<*xf32> {tf_saved_model.index_path = ["r"]}) attributes {tf_saved_model.exported_names = ["main"]} { // CHECK-NOT: tf.VarHandleOp - // CHECK: "tf._TfrtGetResource"() + // CHECK: "tf._TfrtGetResource"() %0 = "tf.VarHandleOp"() {device = "/device:CPU:0", container = "", shared_name = "variable"} : () -> tensor>> // CHECK: "tf.BatchFunction"(%arg0, %0) // CHECK: operandSegmentSizes = array @@ -288,4 +288,4 @@ func.func private @func2(%arg: tensor) -> tensor { func.return %r : tensor } -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops_mlrt.mlir b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops_mlrt.mlir index 7b797b357a1539..7f82726b0740fa 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops_mlrt.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops_mlrt.mlir @@ -7,17 +7,17 @@ module attributes {tf_saved_model.semantics} { // CHECK-LABEL: func @_tfrt_resource_init // CHECK: [[handle:%.*]] = "tf.HashTableV2"() // CHECK-SAME: shared_name = "x" -// CHECK: "tf._TfrtSetResource"([[handle]]) {device = "/job:localhost/replica:0/task:0/device:CPU:0", index = [[handle_id:.*]] : i64} +// CHECK: 
"tf._TfrtSetResource"([[handle]]) <{index = [[handle_id:.*]] : i64}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} // CHECK: [[x:%.*]] = "tf.LookupTableSizeV2"([[handle]]) -// CHECK: "tf._TfrtSetResource"([[x]]) {device = "/job:localhost/replica:0/task:0/device:CPU:0", index = [[size_id:.*]] : i64} : (tensor) -> () +// CHECK: "tf._TfrtSetResource"([[x]]) <{index = [[size_id:.*]] : i64}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> () // CHECK: func @test_hoist_hash_table func.func @hoist_hash_table(%arg: tensor {tf_saved_model.index_path = ["input"]}, %default: tensor {tf_saved_model.index_path = ["default"]}) -> (tensor {tf_saved_model.index_path = ["r"]}, tensor<*xi64> {tf_saved_model.index_path = ["r1"]}) attributes {tf_saved_model.exported_names = ["test_hoist_hash_table"]} { // CHECK-NOT: tf.HashTableV2 // CHECK-NOT: tf.LookupTableSizeV2 - // CHECK-DAG: [[v0:%.*]] = "tf._TfrtGetResource"() {container = [""], device = "/job:localhost/replica:0/task:0/device:CPU:0", indices = [[[handle_id]]], shared_name = [{{.*}}]} - // CHECK-DAG: [[v1:%.*]] = "tf._TfrtGetResource"() {container = [""], device = "/job:localhost/replica:0/task:0/device:CPU:0", indices = [[[size_id]]], shared_name = [{{.*}}]} + // CHECK-DAG: [[v0:%.*]] = "tf._TfrtGetResource"() <{container = [""], indices = [[[handle_id]]], shared_name = [{{.*}}]}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} + // CHECK-DAG: [[v1:%.*]] = "tf._TfrtGetResource"() <{container = [""], indices = [[[size_id]]], shared_name = [{{.*}}]}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} // CHECK-DAG: [[r:%.*]] = "tf.LookupTableFindV2"([[v0]] // CHECK-DAG: return [[v1]], [[r]] %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "x", use_node_name_sharing = false, value_dtype = i64} : () -> tensor diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/async_while.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/async_while.mlir index b13096c9ea0fb2..d7fee9dee62358 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/async_while.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/async_while.mlir @@ -22,11 +22,11 @@ func.func private @"map/while_body"(%loop_count: tensor, %max_iterations: t } // CHECK-LABEL: func.func private @"map/while_body/TfMlrtAsyncWhileBody"(%arg0: !mlrt.promise, %arg1: !mlrt.future, %arg2: !mlrt.promise, %arg3: !mlrt.future, %arg4: !mlrt.promise, %arg5: tensor, %arg6: tensor, %arg7: tensor<*xf32>) { -// CHECK-NEXT: %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %cst = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %0 = "tf_mlrt.tf_await"(%arg1) : (!mlrt.future) -> tensor // CHECK-NEXT: %1 = "tf.AddV2"(%0, %cst) : (tensor, tensor) -> tensor // CHECK-NEXT: "tf_mlrt.tf_promise"(%arg2, %1) : (!mlrt.promise, tensor) -> () -// CHECK-NEXT: %2 = "tf.PartitionedCall"(%1, %arg5) {config = "", config_proto = "", executor_type = "", f = @"map/while_cond/TfMlrtAsyncWhilePredicate"} : (tensor, tensor) -> tensor +// CHECK-NEXT: %2 = "tf.PartitionedCall"(%1, %arg5) <{config = "", config_proto = "", executor_type = "", f = @"map/while_cond/TfMlrtAsyncWhilePredicate"}> : (tensor, tensor) -> tensor // CHECK-NEXT: "tf_mlrt.tf_promise"(%arg0, %2) : (!mlrt.promise, tensor) -> () // CHECK-NEXT: %3 = "tf.TensorArrayReadV3"(%arg6, %0, %arg7) : (tensor, tensor, tensor<*xf32>) -> tensor<3x3xf32> // CHECK-NEXT: %4 = "tf_mlrt.tf_await"(%arg3) : (!mlrt.future) -> tensor<3x3xf32> @@ -37,7 +37,7 @@ 
func.func private @"map/while_body"(%loop_count: tensor, %max_iterations: t //CHECK-LABEL: func.func @serving_default func.func @serving_default(%max_iterations: tensor, %array_handle: tensor, %array_flow: tensor<*xf32>, %matrix: tensor<3x3xf32>) -> (tensor<3x3xf32>, tensor<*xf32>) { %cst_0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK: %0 = "tf.PartitionedCall"(%cst, %arg0) {config = "", config_proto = "", executor_type = "", f = @"map/while_cond/TfMlrtAsyncWhilePredicate"} : (tensor, tensor) -> tensor + // CHECK: %0 = "tf.PartitionedCall"(%cst, %arg0) <{config = "", config_proto = "", executor_type = "", f = @"map/while_cond/TfMlrtAsyncWhilePredicate"}> : (tensor, tensor) -> tensor // CHECK-NEXT: %1:6 = tf_mlrt.tf_async_while @"map/while_body/TfMlrtAsyncWhileBody"(%0, %cst, %arg3, %arg0, %arg1, %arg2) {invariant_size = 3 : i32} : (tensor, tensor, tensor<3x3xf32>, tensor, tensor, tensor<*xf32>) -> (!mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future) %1:5 = "tf.While"(%cst_0, %max_iterations, %array_handle, %array_flow, %matrix) {body= @"map/while_body", cond = @"map/while_cond", is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>) -> (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>) // CHECK-NEXT: %2 = "tf_mlrt.tf_await"(%1#5) : (!mlrt.future) -> tensor<*xf32> @@ -50,10 +50,10 @@ func.func @serving_default(%max_iterations: tensor, %array_handle: tensor, %array_handle: tensor, %array_flow: tensor<*xf32>, %matrix: tensor<3x3xf32>, %array_handle_2: tensor, %array_flow_2: tensor<*xf32>, %matrix_2: tensor<3x3xf32>) -> (tensor<3x3xf32>, tensor<3x3xf32>) { %cst_0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK: %0 = "tf.PartitionedCall"(%cst, %arg0) {config = "", config_proto = "", executor_type = "", f = @"map/while_cond/TfMlrtAsyncWhilePredicate"} : (tensor, tensor) -> tensor + // CHECK: %0 = "tf.PartitionedCall"(%cst, %arg0) <{config = "", config_proto = "", executor_type = "", f = @"map/while_cond/TfMlrtAsyncWhilePredicate"}> : (tensor, tensor) -> tensor // CHECK-NEXT: %1:6 = tf_mlrt.tf_async_while @"map/while_body/TfMlrtAsyncWhileBody"(%0, %cst, %arg3, %arg0, %arg1, %arg2) {invariant_size = 3 : i32} : (tensor, tensor, tensor<3x3xf32>, tensor, tensor, tensor<*xf32>) -> (!mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future) %1:5 = "tf.While"(%cst_0, %max_iterations, %array_handle, %array_flow, %matrix) {body= @"map/while_body", cond = @"map/while_cond", is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>) -> (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>) - // CHECK: %2 = "tf.PartitionedCall"(%cst, %arg0) {config = "", config_proto = "", executor_type = "", f = @"map/while_cond/TfMlrtAsyncWhilePredicate"} : (tensor, tensor) -> tensor + // CHECK: %2 = "tf.PartitionedCall"(%cst, %arg0) <{config = "", config_proto = "", executor_type = "", f = @"map/while_cond/TfMlrtAsyncWhilePredicate"}> : (tensor, tensor) -> tensor // CHECK-NEXT: %3:6 = tf_mlrt.tf_async_while @"map/while_body/TfMlrtAsyncWhileBody"(%2, %cst, %arg6, %arg0, %arg4, %arg5) {invariant_size = 3 : i32} : (tensor, tensor, tensor<3x3xf32>, tensor, tensor, tensor<*xf32>) -> (!mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future) %2:5 = "tf.While"(%cst_0, %max_iterations, %array_handle_2, %array_flow_2, %matrix_2) {body= @"map/while_body", 
cond = @"map/while_cond", is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>) -> (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>) // CHECK-NEXT: %4 = "tf_mlrt.tf_await"(%1#2) : (!mlrt.future) -> tensor<3x3xf32> @@ -128,11 +128,11 @@ func.func private @"random/while_body"(%loop_count: tensor, %max_iterations } // CHECK-LABEL: func.func private @"random/while_body/TfMlrtAsyncWhileBody_1"(%arg0: !mlrt.promise, %arg1: !mlrt.future, %arg2: !mlrt.promise, %arg3: !mlrt.future, %arg4: !mlrt.promise, %arg5: tensor, %arg6: tensor, %arg7: tensor<*xf32>) { -// CHECK-NEXT: %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %cst = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %0 = "tf_mlrt.tf_await"(%arg1) : (!mlrt.future) -> tensor // CHECK-NEXT: %1 = "tf.AddV2"(%0, %cst) : (tensor, tensor) -> tensor // CHECK-NEXT: "tf_mlrt.tf_promise"(%arg2, %1) : (!mlrt.promise, tensor) -> () -// CHECK-NEXT: %2 = "tf.PartitionedCall"(%1, %arg5) {config = "", config_proto = "", executor_type = "", f = @"random/while_cond/TfMlrtAsyncWhilePredicate_0"} : (tensor, tensor) -> tensor +// CHECK-NEXT: %2 = "tf.PartitionedCall"(%1, %arg5) <{config = "", config_proto = "", executor_type = "", f = @"random/while_cond/TfMlrtAsyncWhilePredicate_0"}> : (tensor, tensor) -> tensor // CHECK-NEXT: "tf_mlrt.tf_promise"(%arg0, %2) : (!mlrt.promise, tensor) -> () // CHECK-NEXT: %3 = "tf.TensorArrayReadV3"(%arg6, %0, %arg7) : (tensor, tensor, tensor<*xf32>) -> tensor<3x3xf32> // CHECK-NEXT: %4 = "tf_mlrt.tf_await"(%arg3) : (!mlrt.future) -> tensor<3x3xf32> @@ -143,7 +143,7 @@ func.func private @"random/while_body"(%loop_count: tensor, %max_iterations //CHECK-LABEL: func.func @random_serving_default func.func @random_serving_default(%max_iterations: tensor, %array_handle: tensor, %array_flow: tensor<*xf32>, %matrix: tensor<3x3xf32>) -> (tensor<3x3xf32>, tensor<*xf32>) { %cst_0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK: %0 = "tf.PartitionedCall"(%cst, %arg0) {config = "", config_proto = "", executor_type = "", f = @"random/while_cond/TfMlrtAsyncWhilePredicate_0"} : (tensor, tensor) -> tensor + // CHECK: %0 = "tf.PartitionedCall"(%cst, %arg0) <{config = "", config_proto = "", executor_type = "", f = @"random/while_cond/TfMlrtAsyncWhilePredicate_0"}> : (tensor, tensor) -> tensor // CHECK-NEXT: %1:6 = tf_mlrt.tf_async_while @"random/while_body/TfMlrtAsyncWhileBody_1"(%0, %cst, %arg3, %arg0, %arg1, %arg2) {invariant_size = 3 : i32} : (tensor, tensor, tensor<3x3xf32>, tensor, tensor, tensor<*xf32>) -> (!mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future) %1:5 = "tf.While"(%cst_0, %max_iterations, %array_handle, %array_flow, %matrix) {body= @"random/while_body", cond = @"random/while_cond", is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>) -> (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>) // CHECK-NEXT: %2 = "tf_mlrt.tf_await"(%1#5) : (!mlrt.future) -> tensor<*xf32> @@ -166,7 +166,7 @@ func.func private @"sort_map/while_cond"(%loop_count: tensor, %max_iteratio // CHECK-NEXT: return %0 : tensor // CHECK-LABEL: func.func private @"sort_map/while_body" -// CHECK-NEXT: %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %cst = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %0 = "tf.AddV2"(%arg0, %cst) : (tensor, 
tensor) -> tensor // CHECK-NEXT: %1 = "tf.TensorArrayReadV3"(%arg2, %arg0, %arg3) : (tensor, tensor, tensor<*xf32>) -> tensor<3x3xf32> // CHECK-NEXT: %2 = "tf.TensorArrayReadV3"(%arg5, %arg0, %arg6) : (tensor, tensor, tensor<*xf32>) -> tensor<3x3xf32> @@ -177,7 +177,7 @@ func.func private @"sort_map/while_cond"(%loop_count: tensor, %max_iteratio // CHECK-NEXT: %7 = "tf.Identity"(%4) : (tensor) -> tensor // CHECK-NEXT: %8 = "tf.MatMul"(%5, %arg7) : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> // CHECK-NEXT: %9 = "tf.Select"(%7, %8, %arg7) : (tensor, tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> -// CHECK-NEXT: return %0, %arg1, %arg2, %arg3, %6, %arg5, %arg6, %9, %arg8 +// CHECK-NEXT: return %0, %arg1, %arg2, %arg3, %6, %arg5, %arg6, %9, %arg8 func.func private @"sort_map/while_body"(%loop_count: tensor, %max_iterations: tensor, %handle: tensor, %flow_in: tensor<*xf32>, %matrix: tensor<3x3xf32>, %handle_2: tensor, %flow_in_2: tensor<*xf32>, %matrix_2: tensor<3x3xf32>, %bound: tensor) -> (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>, tensor, tensor<*xf32>, tensor<3x3xf32>, tensor) { %cst_1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %updated_loop_count = "tf.AddV2"(%loop_count, %cst_1) : (tensor, tensor) -> tensor @@ -194,11 +194,11 @@ func.func private @"sort_map/while_body"(%loop_count: tensor, %max_iteratio } // CHECK-LABEL: func.func private @"sort_map/while_body/TfMlrtAsyncWhileBody" -// CHECK-NEXT: %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %cst = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %0 = "tf_mlrt.tf_await"(%arg1) : (!mlrt.future) -> tensor // CHECK-NEXT: %1 = "tf.AddV2"(%0, %cst) : (tensor, tensor) -> tensor // CHECK-NEXT: "tf_mlrt.tf_promise"(%arg2, %1) : (!mlrt.promise, tensor) -> () -// CHECK-NEXT: %2 = "tf.PartitionedCall"(%1, %arg7) {config = "", config_proto = "", executor_type = "", f = @"sort_map/while_cond/TfMlrtAsyncWhilePredicate"} : (tensor, tensor) -> tensor +// CHECK-NEXT: %2 = "tf.PartitionedCall"(%1, %arg7) <{config = "", config_proto = "", executor_type = "", f = @"sort_map/while_cond/TfMlrtAsyncWhilePredicate"}> : (tensor, tensor) -> tensor // CHECK-NEXT: "tf_mlrt.tf_promise"(%arg0, %2) : (!mlrt.promise, tensor) -> () // CHECK-NEXT: %3 = "tf.TensorArrayReadV3"(%arg8, %0, %arg9) : (tensor, tensor, tensor<*xf32>) -> tensor<3x3xf32> // CHECK-NEXT: %4 = "tf.TensorArrayReadV3"(%arg10, %0, %arg11) : (tensor, tensor, tensor<*xf32>) -> tensor<3x3xf32> @@ -218,7 +218,7 @@ func.func private @"sort_map/while_body"(%loop_count: tensor, %max_iteratio //CHECK-LABEL: func.func @sort_serving_default func.func @sort_serving_default(%max_iterations: tensor, %array_handle: tensor, %array_flow: tensor<*xf32>, %matrix: tensor<3x3xf32>, %bound: tensor) -> (tensor<3x3xf32>, tensor<3x3xf32>, tensor<*xf32>) { %cst_0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK: %0 = "tf.PartitionedCall"(%cst, %arg0) {config = "", config_proto = "", executor_type = "", f = @"sort_map/while_cond/TfMlrtAsyncWhilePredicate"} : (tensor, tensor) -> tensor + // CHECK: %0 = "tf.PartitionedCall"(%cst, %arg0) <{config = "", config_proto = "", executor_type = "", f = @"sort_map/while_cond/TfMlrtAsyncWhilePredicate"}> : (tensor, tensor) -> tensor // CHECK-NEXT: %1:10 = tf_mlrt.tf_async_while @"sort_map/while_body/TfMlrtAsyncWhileBody"(%0, %cst, %arg3, %arg3, %arg0, %arg1, %arg2, %arg1, %arg2, %arg4) {invariant_size = 6 : i32} : (tensor, tensor, tensor<3x3xf32>, tensor<3x3xf32>, tensor, 
tensor, tensor<*xf32>, tensor, tensor<*xf32>, tensor) -> (!mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future, !mlrt.future) %1:9 = "tf.While"(%cst_0, %max_iterations, %array_handle, %array_flow, %matrix , %array_handle, %array_flow, %matrix, %bound) {body= @"sort_map/while_body", cond = @"sort_map/while_cond", is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>, tensor, tensor<*xf32>, tensor<3x3xf32>,tensor) -> (tensor, tensor, tensor, tensor<*xf32>, tensor<3x3xf32>, tensor, tensor<*xf32>, tensor<3x3xf32>, tensor) // CHECK-NEXT: %2 = "tf_mlrt.tf_await"(%1#6) : (!mlrt.future) -> tensor<*xf32> diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir index 27c92289a5bddf..2b1f5fc9b17a4e 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir @@ -37,7 +37,7 @@ func.func private @"map/while_body"(%arg0: tensor, %arg1: tensor, %arg // CHECK-SAME: (%arg0: !mlrt.future, %arg1: !mlrt.promise, %arg2: tensor, %arg3: tensor, %arg4: tensor) // CHECK: [[det:%.*]] = "tf.MatrixDeterminant" // CHECK-NEXT: [[ta_0:%.*]] = "tf_mlrt.tf_await"(%arg0) : (!mlrt.future) -> tensor>> -// CHECK-NEXT: [[ta_1:%.*]] = "tf.TensorListSetItem"([[ta_0]], %arg3, [[det]]) { +// CHECK-NEXT: [[ta_1:%.*]] = "tf.TensorListSetItem"([[ta_0]], %arg3, [[det]]) <{ // CHECK-NEXT: "tf_mlrt.tf_promise"(%arg1, [[ta_1]]) : (!mlrt.promise, tensor>>) -> () // CHECK-NEXT: return @@ -53,7 +53,7 @@ func.func @serving_default(%arg0: tensor {tf.device = "/job:localhost/rep // CHECK-SAME: {body_fn = @"map/while_body/MapFnBody", num_tensor_list_or_flow_in = 1 : i32} // CHECK-NOT: tf.While %1:4 = "tf.While"(%cst, %cst, %0, %arg0) {_lower_using_switch_merge = true, _num_original_outputs = 6 : i64, _read_only_resource_inputs = [], _xla_propagate_compile_time_consts = true, body = @"map/while_body", cond = @"map/while_cond", device = "/job:localhost/replica:0/task:0/device:CPU:0", is_stateless = true, parallel_iterations = 4 : i64, shape_invariant} : (tensor, tensor, tensor>>, tensor) -> (tensor, tensor, tensor>>, tensor) - // CHECK-NEXT: "tf.TensorListStack"([[map_fn_result]], %cst_0) { + // CHECK-NEXT: "tf.TensorListStack"([[map_fn_result]], %cst_0) <{ %2 = "tf.TensorListStack"(%1#2, %cst_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0", num_elements = 3 : i64} : (tensor>>, tensor<0xi32>) -> tensor<3xf32> return %2 : tensor<3xf32> } @@ -458,7 +458,7 @@ func.func private @tf.WhileRegion2_body(%arg0: tensor<*xi32>) -> (tensor>>, tensor, tensor) -> tensor // CHECK: TensorArrayGatherV3 %6 = "tf.TensorArrayGatherV3"(%handle_14, %2, %4#3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor - return %5, %6 : tensor, tensor + return %5, %6 : tensor, tensor } // ----- diff --git a/tensorflow/compiler/mlir/tfrt/tests/sink_in_invariant_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/sink_in_invariant_ops.mlir index 2b0344cd7202a2..42e2e7ccb5086a 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/sink_in_invariant_ops.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/sink_in_invariant_ops.mlir @@ -93,8 +93,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-LABEL: func private @batched_function func.func private @batched_function(%arg0: tensor>>, %arg1: tensor>>) -> 
tensor<1x3xf32> attributes {tf._input_shapes = [#tf_type.shape<1x3>, #tf_type.shape<*>], tf.signature.is_stateful} { - // CHECK-DAG: [[handle1:%.*]] = "tf.VarHandleOp"() {{{.*}}, shared_name = "variable1"} - // CHECK-DAG: [[handle2:%.*]] = "tf.VarHandleOp"() {{{.*}}, shared_name = "variable2"} + // CHECK-DAG: [[handle1:%.*]] = "tf.VarHandleOp"() <{{{.*}}, shared_name = "variable1"}> + // CHECK-DAG: [[handle2:%.*]] = "tf.VarHandleOp"() <{{{.*}}, shared_name = "variable2"}> // CHECK: "tf.ReadVariableOp"([[handle1]]) // CHECK: "tf.ReadVariableOp"([[handle2]]) %0 = "tf.ReadVariableOp"(%arg0) {device = "/device:CPU:0"} : (tensor>>) -> tensor<1x3xf32> @@ -298,7 +298,7 @@ func.func private @nested_batched_function(%arg0: tensor<1x3xf32>, %arg1: tensor } // CHECK-LABEL: func @main -func.func @main(%arg0: tensor<1x3xf32> {tf_saved_model.index_path = ["input"]}) -> (tensor<*xf32> {tf_saved_model.index_path = ["r"]}) +func.func @main(%arg0: tensor<1x3xf32> {tf_saved_model.index_path = ["input"]}) -> (tensor<*xf32> {tf_saved_model.index_path = ["r"]}) attributes {tf_saved_model.exported_names = ["main"]} { // CHECK: [[handle:%.*]] = "tf.VarHandleOp"() %0 = "tf.VarHandleOp"() {device = "/device:CPU:0", container = "", shared_name = "variable"} : () -> tensor>> diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/merge_tf_if_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/merge_tf_if_ops.mlir index f79c31963b3693..cb1990762f0b20 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/merge_tf_if_ops.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/merge_tf_if_ops.mlir @@ -51,8 +51,8 @@ func.func @nested_if_op_else_1(%cond: tensor, %x: tensor, %y: tensor, [[y:%.*]]: tensor) -// CHECK-DAG: [[cst:%.*]] = "tf.Const"() {value = dense<1> : tensor} -// CHECK-DAG: [[cst_0:%.*]] = "tf.Const"() {value = dense<2> : tensor} +// CHECK-DAG: [[cst:%.*]] = "tf.Const"() <{value = dense<1> : tensor}> +// CHECK-DAG: [[cst_0:%.*]] = "tf.Const"() <{value = dense<2> : tensor}> // CHECK: [[r0:%.*]] = "tf.AddV2"([[x]], [[cst]]) // CHECK: [[r1:%.*]] = "tf.AddV2"([[y]], [[r0]]) // CHECK: [[r2:%.*]] = "tf.AddV2"([[x]], [[cst_0]]) @@ -63,7 +63,7 @@ func.func @nested_if_op_else_1(%cond: tensor, %x: tensor, %y: tensor, [[y:%.*]]: tensor, [[cond:%.*]]: tensor) func.func @merge_stateless(%x: tensor, %y: tensor, %cond: tensor) -> (tensor, tensor, tensor) { // CHECK-NEXT: [[res:%.*]]:3 = "tf.If"([[cond]], [[x]], [[y]]) - // CHECK-SAME: {else_branch = @merge_stateless_merged_if_0_0_else, is_stateless = true, then_branch = @merge_stateless_merged_if_0_0_then} + // CHECK-SAME: <{else_branch = @merge_stateless_merged_if_0_0_else, is_stateless = true, then_branch = @merge_stateless_merged_if_0_0_then}> // CHECK-SAME: (tensor, tensor, tensor) -> (tensor, tensor, tensor) // CHECK-NEXT: return [[res]]#0, [[res]]#1, [[res]]#2 %0, %1 = "tf.If"(%cond, %x, %y) {else_branch = @no_side_effect_else_0, then_branch = @no_side_effect_then_0, is_stateless = true} : (tensor, tensor, tensor) -> (tensor, tensor) @@ -83,8 +83,8 @@ func.func @merge_stateless(%x: tensor, %y: tensor, %cond: tensor) // CHECK-LABEL: func private @merge_nested_if_op_merged_if_0_0_else_merged_if_1_0_else // CHECK-SAME: ([[x:%.*]]: tensor, [[y:%.*]]: tensor) -// CHECK-NEXT: [[cst:%.*]] = "tf.Const"() {value = dense<2> : tensor} -// CHECK-NEXT: [[cst_0:%.*]] = "tf.Const"() {value = dense<1> : tensor} +// CHECK-NEXT: [[cst:%.*]] = "tf.Const"() <{value = dense<2> : tensor}> +// CHECK-NEXT: [[cst_0:%.*]] = "tf.Const"() <{value = dense<1> : tensor}> 
// CHECK-NEXT: [[r0:%.*]] = "tf.AddV2"([[x]], [[cst_0]]) // CHECK-NEXT: [[r1:%.*]] = "tf.AddV2"([[y]], [[r0]]) // CHECK-NEXT: [[r2:%.*]] = "tf.AddV2"([[x]], [[cst]]) @@ -93,14 +93,14 @@ func.func @merge_stateless(%x: tensor, %y: tensor, %cond: tensor) // CHECK-LABEL: func private @merge_nested_if_op_merged_if_0_0_else // CHECK-SAME: ([[cond:%.*]]: tensor, [[x:%.*]]: tensor, [[y:%.*]]: tensor) -// CHECK-NEXT: [[r0:%.*]]:3 = "tf.If"(%arg0, %arg1, %arg2) {else_branch = @merge_nested_if_op_merged_if_0_0_else_merged_if_1_0_else, is_stateless = true, then_branch = @merge_nested_if_op_merged_if_0_0_else_merged_if_1_0_then} : (tensor, tensor, tensor) -> (tensor, tensor, tensor) +// CHECK-NEXT: [[r0:%.*]]:3 = "tf.If"(%arg0, %arg1, %arg2) <{else_branch = @merge_nested_if_op_merged_if_0_0_else_merged_if_1_0_else, is_stateless = true, then_branch = @merge_nested_if_op_merged_if_0_0_else_merged_if_1_0_then}> : (tensor, tensor, tensor) -> (tensor, tensor, tensor) // CHECK-NEXT: return [[r0]]#0, [[r0]]#1, [[r0]]#2 // CHECK-LABEL: func @merge_nested_if_op // CHECK-SAME: ([[x:%.*]]: tensor, [[y:%.*]]: tensor, [[cond:%.*]]: tensor, [[nested_cond:%.*]]: tensor) func.func @merge_nested_if_op(%x: tensor, %y: tensor, %cond: tensor, %nested_cond: tensor) -> (tensor, tensor, tensor) { // CHECK-NEXT: [[res:%.*]]:3 = "tf.If"([[cond]], [[nested_cond]], [[x]], [[y]]) - // CHECK-SAME: {else_branch = @merge_nested_if_op_merged_if_0_0_else, is_stateless = true, then_branch = @merge_nested_if_op_merged_if_0_0_then} + // CHECK-SAME: <{else_branch = @merge_nested_if_op_merged_if_0_0_else, is_stateless = true, then_branch = @merge_nested_if_op_merged_if_0_0_then}> // CHECK-SAME: (tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor) // CHECK-NEXT: return [[res]]#0, [[res]]#1, [[res]]#2 %0, %1 = "tf.If"(%cond, %nested_cond, %x, %y) {else_branch = @nested_if_op_else_0, then_branch = @nested_if_op_then_0, is_stateless = true} : (tensor, tensor, tensor, tensor) -> (tensor, tensor) @@ -122,7 +122,7 @@ func.func @merge_side_effect(%x: tensor, %y: tensor, %cond: tensor // CHECK-LABEL: func @multiple_uses func.func @multiple_uses(%x: tensor, %y: tensor, %cond: tensor) -> (tensor, tensor, tensor) { // CHECK-NEXT: tf.If - // CHECK-SAME: {else_branch = @multiple_uses_merged_if_0_0_else, is_stateless = true, then_branch = @multiple_uses_merged_if_0_0_then} + // CHECK-SAME: <{else_branch = @multiple_uses_merged_if_0_0_else, is_stateless = true, then_branch = @multiple_uses_merged_if_0_0_then}> %0, %1 = "tf.If"(%cond, %x, %y) {else_branch = @no_side_effect_else_0, then_branch = @no_side_effect_then_0, is_stateless = true} : (tensor, tensor, tensor) -> (tensor, tensor) %2 = "tf.If"(%cond, %x, %y) {else_branch = @no_side_effect_else_1, then_branch = @no_side_effect_then_1, is_stateless = true} : (tensor, tensor, tensor) -> tensor func.return %0, %1, %2 : tensor, tensor, tensor diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc index 28072642a22746..6e04fe1c1e23a1 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc @@ -14,7 +14,9 @@ limitations under the License. 
==============================================================================*/
#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h"
+#include
#include
+#include
#include "absl/log/log.h"
#include "absl/status/status.h"
@@ -49,10 +51,10 @@ namespace tensorflow {
namespace mlrt_compiler {
StatusOr ConvertTfMlirToBytecode(
-    const TfrtCompileOptions& options,
-    const tfrt_stub::FallbackState& fallback_state, mlir::ModuleOp module,
-    tfrt_stub::ModelRuntimeContext& model_context,
-    mlir::OwningOpRef* module_with_op_keys) {
+    const TfrtCompileOptions& options, tfrt_stub::FallbackState& fallback_state,
+    mlir::ModuleOp module, tfrt_stub::ModelRuntimeContext& model_context,
+    mlir::OwningOpRef* module_with_op_keys,
+    std::vector* added_xla_function_names) {
  mlrt::bc::Buffer bytecode_buffer;
  TF_RETURN_IF_ERROR(ConvertTfMlirToRuntimeExecutable(
      options, module,
@@ -127,7 +129,7 @@ StatusOr ConvertTfMlirToBytecode(
        bytecode_buffer = std::move(*statusor);
        return OkStatus();
      },
-      model_context));
+      model_context, &fallback_state, added_xla_function_names));
  return bytecode_buffer;
}
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h
index 87dc685694235a..ef9caeb47337a2 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h
+++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h
@@ -33,10 +33,10 @@ namespace mlrt_compiler {
//
// This is for initial conversion.
StatusOr ConvertTfMlirToBytecode(
-    const TfrtCompileOptions& options,
-    const tfrt_stub::FallbackState& fallback_state, mlir::ModuleOp module,
-    tfrt_stub::ModelRuntimeContext& model_context,
-    mlir::OwningOpRef* module_with_op_keys = nullptr);
+    const TfrtCompileOptions& options, tfrt_stub::FallbackState& fallback_state,
+    mlir::ModuleOp module, tfrt_stub::ModelRuntimeContext& model_context,
+    mlir::OwningOpRef* module_with_op_keys = nullptr,
+    std::vector* added_xla_function_names = nullptr);
// Converts an MLIR `module_with_op_keys` in TF dialect to MLRT's bytecode
// format, with op costs from `cost_recorder`.
diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc
index 1b933933de6673..9a8d41a4db7d03 100644
--- a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc
+++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc
@@ -44,6 +44,7 @@ limitations under the License.
#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" #include "tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h" #include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/function_body.h" #include "tensorflow/core/common_runtime/function_def_utils.h" #include "tensorflow/core/platform/env.h" @@ -233,7 +234,17 @@ Status ConvertTfMlirToRuntimeExecutable( "Failed to process TPUPartitionedCallOp for fallback execution")); } } else if (options.device_target == TfrtDeviceInfraTarget::kGpu) { - TF_RETURN_IF_ERROR(mlir::TF::RunTFXLABridge(module)); + TF_RETURN_IF_ERROR( + tensorflow::tf2xla::v2::RunFunctionTf2xlaClusteringBridge( + module, tf2xla::v2::DeviceType::XLA_GPU_JIT, + /*is_in_fallback_enabled_mode=*/false)); + + TF_RETURN_IF_ERROR( + tensorflow::tfrt_compiler::RunLowerClusterToRuntimeOpsPassPipeline( + module, tsl::DeviceType(DEVICE_GPU_XLA_JIT))); + + TF_RETURN_IF_ERROR( + tensorflow::tf2xla::v2::ExportFromTensorflowDialectToExecutor(module)); if (options.serialize_mlir_module_to_aot_packages) { const std::string mlir_string = SerializeMlirModule(module); diff --git a/tensorflow/compiler/tests/xla_call_module_test.py b/tensorflow/compiler/tests/xla_call_module_test.py index d46ab642bc81b1..ea31b4c072e607 100644 --- a/tensorflow/compiler/tests/xla_call_module_test.py +++ b/tensorflow/compiler/tests/xla_call_module_test.py @@ -92,12 +92,12 @@ def f(x): self._assertOpOutputMatchesExpected(f, (x,), (np.sin(np.cos(x)),)) - def test_basic_with_token(self): + def test_basic_with_token_v8(self): x = np.array([1.0, 2.0, 3.0], dtype=np.float32) def f(x): # sin(cos(x)) - module, version = serialize(""" + module, _ = serialize(""" module @jit_f.0 { func.func public @main(%arg0: !stablehlo.token, %arg1: tensor<3xf32>) -> (!stablehlo.token, tensor<3xf32>) { %0 = stablehlo.cosine %arg1 : tensor<3xf32> @@ -105,6 +105,32 @@ def f(x): return %arg0, %1 : !stablehlo.token, tensor<3xf32> } } +""") + return xla.call_module( + [x], + version=8, # Version 8 uses only one prefix token + module=module, + Tout=[x.dtype], + Sout=[x.shape], + has_token_input_output=True, # Version 8 cares about this + platforms=[self.testing_platform()], + ) + + self._assertOpOutputMatchesExpected(f, (x,), (np.sin(np.cos(x)),)) + + def test_basic_with_multiple_tokens(self): + x = np.array([1.0, 2.0, 3.0], dtype=np.float32) + + def f(x): + # sin(cos(x)) + module, version = serialize(""" +module @jit_f.0 { + func.func public @main(%arg0: !stablehlo.token {jax.token = true}, %arg1: !stablehlo.token {jax.token = true}, %arg2: tensor<3xf32>) -> (!stablehlo.token, !stablehlo.token, tensor<3xf32>) { + %0 = stablehlo.cosine %arg2 : tensor<3xf32> + %1 = stablehlo.sine %0 : tensor<3xf32> + return %arg0, %arg1, %1 : !stablehlo.token, !stablehlo.token, tensor<3xf32> + } +} """) return xla.call_module( [x], @@ -112,7 +138,31 @@ def f(x): module=module, Tout=[x.dtype], Sout=[x.shape], - has_token_input_output=True, + platforms=[self.testing_platform()], + ) + + self._assertOpOutputMatchesExpected(f, (x,), (np.sin(np.cos(x)),)) + + def test_basic_with_tokens_preceeded_by_other_args(self): + x = np.array([1.0, 2.0, 3.0], dtype=np.float32) + + def f(x): + # sin(cos(x)) + module, version = serialize(""" +module @jit_f.0 { + func.func public @main(%arg0: tensor, %arg1: !stablehlo.token {jax.token = true}, %arg2: !stablehlo.token {jax.token = true}, %arg3: tensor<3xf32>) -> (!stablehlo.token, !stablehlo.token, 
tensor<3xf32>) { + %0 = stablehlo.cosine %arg3 : tensor<3xf32> + %1 = stablehlo.sine %0 : tensor<3xf32> + return %arg1, %arg2, %1 : !stablehlo.token, !stablehlo.token, tensor<3xf32> + } +} +""") + return xla.call_module( + [np.int32(0), x], + version=version, + module=module, + Tout=[x.dtype], + Sout=[x.shape], platforms=[self.testing_platform()], ) @@ -183,7 +233,7 @@ def f(x): # x: f32[2, b] %0, %1 = call @dyn_main(%arg0_new, %arg1) : (tensor<{dim_var_type}>, tensor<2x?xf32>) -> (tensor<2x?xf32>, tensor<{dim_var_type}>) return %0, %1 : tensor<2x?xf32>, tensor<{dim_var_type}> }} - func.func private @dyn_main(%arg0: tensor<{dim_var_type}>, %arg1: tensor<2x?xf32>) -> (tensor<2x?xf32>, tensor<{dim_var_type}>) {{ + func.func private @dyn_main(%arg0: tensor<{dim_var_type}> {{jax.global_constant = "b"}}, %arg1: tensor<2x?xf32>) -> (tensor<2x?xf32>, tensor<{dim_var_type}>) {{ %0 = stablehlo.sine %arg1 : tensor<2x?xf32> return %0, %arg0 : tensor<2x?xf32>, tensor<{dim_var_type}> }} @@ -278,7 +328,7 @@ def test_platforms_basic(self, *, platform_idx_type: str): # returns x + 2. on CPU, x + 3. on GPU (CUDA or ROCM) and x + 4. on TPU module, version = serialize(f""" module @jit_f.0 {{ - func.func public @main(%arg_platform_idx: tensor<{platform_idx_type}>, %arg0: tensor) -> tensor {{ + func.func public @main(%arg_platform_idx: tensor<{platform_idx_type}> {{jax.global_constant = "_platform_index"}}, %arg0: tensor) -> tensor {{ %0 = stablehlo.convert %arg_platform_idx : (tensor<{platform_idx_type}>) -> tensor %to_add = "stablehlo.case"(%0) ({{ %cpu_val = stablehlo.constant dense<2.> : tensor @@ -319,7 +369,7 @@ def test_platforms_unknown_custom_call(self): # returns x + 2. on CPU, x + 3. on GPU, and x + 4. on TPU module, version = serialize(""" module @jit_f.0 { - func.func public @main(%arg_platform_idx: tensor, %arg0: tensor) -> tensor { + func.func public @main(%arg_platform_idx: tensor {jax.global_constant = "_platform_index"}, %arg0: tensor) -> tensor { %to_add = "stablehlo.case"(%arg_platform_idx) ({ %cpu_val = stablehlo.constant dense<2.> : tensor stablehlo.return %cpu_val : tensor @@ -358,13 +408,13 @@ def test_platforms_and_poly(self): module, version = serialize(""" module @jit_f_jax attributes {jax.uses_shape_polymorphism = true} { - func.func public @main(%arg_platform_idx: tensor, %arg0: tensor) -> (tensor) { + func.func public @main(%arg_platform_idx: tensor {jax.global_constant = "_platform_index"}, %arg0: tensor) -> (tensor) { %0 = stablehlo.get_dimension_size %arg0, dim = 0 : (tensor) -> tensor %5 = call @_wrapped_jax_export_main(%arg_platform_idx, %0, %arg0) : (tensor, tensor, tensor) -> tensor return %5 : tensor } - func.func private @_wrapped_jax_export_main(%arg_platform_idx: tensor, %arg0: tensor, %arg1: tensor) -> (tensor) { + func.func private @_wrapped_jax_export_main(%arg_platform_idx: tensor {jax.global_constant = "_platform_index"}, %arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> (tensor) { %to_add = "stablehlo.case"(%arg_platform_idx) ({ %cpu_val = stablehlo.constant dense<2.> : tensor stablehlo.return %cpu_val : tensor @@ -395,6 +445,52 @@ def f(x): ) self._assertOpOutputMatchesExpected(f, (x,), (expected_value,)) + + def test_platforms_and_poly_and_tokens(self): + if test.is_built_with_rocm(): + self.skipTest('Currently failing on ROCm due to mismatch') + x = np.arange(6, dtype=np.float32) + # returns x + 2. on CPU, x + 3. on GPU (CUDA or ROCM) and x + 4. 
on TPU + + module, version = serialize(""" +module @jit_f_jax attributes {jax.uses_shape_polymorphism = true} { + func.func public @main(%arg_platform_idx: tensor {jax.global_constant = "_platform_index"}, %arg_tok: !stablehlo.token {jax.token = true}, %arg0: tensor) -> (!stablehlo.token, tensor) { + %0 = stablehlo.get_dimension_size %arg0, dim = 0 : (tensor) -> tensor + %5:2 = call @_wrapped_jax_export_main(%arg_platform_idx, %0, %arg_tok, %arg0) : (tensor, tensor, !stablehlo.token, tensor) -> (!stablehlo.token, tensor) + return %5#0, %5#1 : !stablehlo.token, tensor + } + + func.func private @_wrapped_jax_export_main(%arg_platform_idx: tensor {jax.global_constant = "_platform_index"}, %arg0: tensor {jax.global_constant = "b"}, %arg_tok: !stablehlo.token {jax.token = true}, %arg1: tensor) -> (!stablehlo.token, tensor) { + %to_add = "stablehlo.case"(%arg_platform_idx) ({ + %cpu_val = stablehlo.constant dense<2.> : tensor + stablehlo.return %cpu_val : tensor + }, { + %gpu_val = stablehlo.constant dense<3.> : tensor + stablehlo.return %gpu_val : tensor + }, { + %tpu_val = stablehlo.constant dense<4.> : tensor + stablehlo.return %tpu_val : tensor + }) : (tensor) -> tensor + %1 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> + %3 = stablehlo.dynamic_broadcast_in_dim %to_add, %1, dims = [] : (tensor, tensor<1xi32>) -> tensor + %4 = stablehlo.add %3, %arg1 : tensor + return %arg_tok, %4 : !stablehlo.token, tensor + } +} +""") + platforms = ['CPU', 'CUDA', 'ROCM', 'TPU'] + def f(x): + return xla.call_module([x], version=version, + module=module, + Tout=[np.float32], + Sout=[()], + platforms=platforms) + + expected_value = ( + x + dict(CPU=2.0, CUDA=3.0, ROCM=3.0, TPU=4.0)[self.testing_platform()] + ) + self._assertOpOutputMatchesExpected(f, (x,), (expected_value,)) + # A module used for testing errors related to use of "platforms". 
platforms_errors_module_str = """ module @jit_f.0 { @@ -403,7 +499,7 @@ def f(x): } } """ - + def platforms_errors_helper( self, *, @@ -742,7 +838,7 @@ def f(x): # x: f32[b, 5] %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> tensor { %0 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> %1 = "stablehlo.dynamic_iota"(%0) {iota_dimension = 0 : i64} : (tensor<1xi32>) -> tensor return %1 : tensor @@ -790,7 +886,7 @@ def f(x): # x: f32[b, 3] %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<3> : tensor %1 = stablehlo.multiply %arg0, %0 : tensor %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> @@ -819,7 +915,7 @@ def f(x): # x: f32[b, 4] %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<0> : tensor %1 = stablehlo.constant dense<0> : tensor<1xi64> %2 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> @@ -850,7 +946,7 @@ def f(x): # x: f32[b, 4] %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor<4xf32> return %0 : tensor<4xf32> } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor<4xf32> { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> tensor<4xf32> { %0 = stablehlo.constant dense<-1> : tensor %1 = stablehlo.add %arg0, %0 : tensor %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> @@ -887,7 +983,7 @@ def f(x, idx): # x: f32[b, 4] idx: i32 %0 = call @dyn_main(%arg0_new, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor return %0 : tensor } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor, %arg2: tensor) -> tensor { %0 = stablehlo.constant dense<0> : tensor %1 = stablehlo.compare LT, %arg2, %0, SIGNED : (tensor, tensor) -> tensor %2 = stablehlo.add %arg2, %arg0 : tensor @@ -920,7 +1016,7 @@ def f(x, y): # x: f32[b, 4] y: f32[2, b, 4] %0, %1 = call @dyn_main(%arg0_new, %arg1, %arg2) : (tensor, tensor, tensor<2x?x4xf32>) -> (tensor<2x?x4xf32>, tensor<2x?x4xf32>) return %0, %1 : tensor<2x?x4xf32>, tensor<2x?x4xf32> } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x?x4xf32>) -> (tensor<2x?x4xf32>, tensor<2x?x4xf32>) { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor, %arg2: tensor<2x?x4xf32>) -> (tensor<2x?x4xf32>, tensor<2x?x4xf32>) { %0 = stablehlo.constant dense<2> : tensor<1xi32> %2 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> %3 = stablehlo.constant dense<4> : tensor<1xi32> @@ -952,7 +1048,7 @@ def f(x): # x: i32[b] %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<0> : tensor %1 = stablehlo.reduce(%arg1 
init: %0) across dimensions = [0] : (tensor, tensor) -> tensor reducer(%arg2: tensor, %arg3: tensor) { @@ -984,7 +1080,7 @@ def f(x): # x: f32[b, 5] %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<0.000000e+00> : tensor %1 = stablehlo.reduce(%arg1 init: %0) across dimensions = [1] : (tensor, tensor) -> tensor reducer(%arg2: tensor, %arg3: tensor) { @@ -1020,11 +1116,11 @@ def f(x): # x: f32[b] %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> tensor { %0 = call @f(%arg0, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } - func.func private @f(%arg0: tensor, %arg1: tensor) -> tensor { + func.func private @f(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> tensor { %0 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> %1 = "stablehlo.dynamic_iota"(%0) {iota_dimension = 0 : i64} : (tensor<1xi32>) -> tensor return %1 : tensor @@ -1051,7 +1147,7 @@ def f(x): # x: f32[b] %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> tensor { return %arg1 : tensor } } @@ -1081,7 +1177,7 @@ def f(x): # x: f32[b] %0, %1 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> (tensor, tensor) return %0, %1 : tensor, tensor } - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + func.func private @dyn_main(%arg0: tensor {jax.global_constant = "b"}, %arg1: tensor) -> (tensor, tensor) { %0 = stablehlo.constant dense<0> : tensor %1:2 = "stablehlo.while"(%arg1, %0) ({ ^bb0(%arg2: tensor, %arg3: tensor): @@ -1123,7 +1219,7 @@ def f(x): # x: f32[b] %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor return %0 : tensor }} - func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor {{ + func.func private @dyn_main(%arg0: tensor {{jax.global_constant = "b"}}, %arg1: tensor) -> tensor {{ return %arg1 : tensor }} }} @@ -1311,7 +1407,6 @@ def f(x, y): Sout=[res.shape], platforms=[self.testing_platform()], function_list=(foo,), - has_token_input_output=True, ) self._assertOpOutputMatchesExpected(f, (x, y), (res,)) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 9c039e03135ab3..bd8b7a7a68f5ce 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -1151,6 +1151,7 @@ cc_library( visibility = [":internal"], deps = [ ":tf2xla_defs", + ":xla_op_registry", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow", @@ -1159,6 +1160,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops", "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy", "//tensorflow/compiler/mlir/tf2xla/api/v1:cluster_tf", + "//tensorflow/compiler/mlir/tf2xla/api/v1:tf_dialect_to_executor", "//tensorflow/compiler/mlir/tf2xla/api/v2:cluster_tf", "//tensorflow/compiler/mlir/tf2xla/api/v2:tf_dialect_to_executor", "//tensorflow/core:core_cpu", @@ 
-1171,6 +1173,7 @@ cc_library( "@com_google_absl//absl/status", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index e8d7cd052b5fba..e1605b5c3d3bdf 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -369,6 +369,7 @@ tf_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/tpu:tpu_defs", + "@com_google_absl//absl/log", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc index d8e5fcefdda67b..f602e66fd9fa55 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc @@ -80,11 +80,11 @@ constexpr int kVersionStartSupportCallTFGraph = 5; constexpr int kVersionStartSupportDisabledChecks = 6; constexpr int kVersionStartSupportShapeAssertions = 7; constexpr int kVersionStartSupportUsesShapePolymorphismAttr = 8; +constexpr int kVersionStartSupportEffects = 9; constexpr int kVersionMinimumSupported = kVersionStartStableHloCompatibility; // This should match xla.py:call_module_maximum_supported_version -constexpr int kVersionMaximumSupported = - kVersionStartSupportUsesShapePolymorphismAttr; +constexpr int kVersionMaximumSupported = kVersionStartSupportEffects; constexpr llvm::StringRef kDisabledCheckPlatform = "platform"; @@ -141,6 +141,11 @@ tsl::Status SetPlatformIndex(mlir::func::FuncOp main, int platform_index) { } // namespace +bool IsTokenType(mlir::Type type) { + return type.isa() || + type.isa(); +} + tsl::StatusOr> XlaCallModuleLoader::Create( mlir::MLIRContext *context, int version, std::string module_str, std::vector disabled_checks, @@ -236,7 +241,8 @@ tsl::Status XlaCallModuleLoader::RefineDynamicShapes( // Refine 'main' argument types to use static input types instead. The main // arguments may occur as return values, or as inputs to called functions, // and changing their types may invalidate the module. To prevent this - // we insert dummy conversion ops as the sole uses of the main arguments. + // we insert dummy conversion ops as the sole uses of the main arguments, for + // the arguments that are not tokens and have dynamic shape. // If we use stablehlo.convert, we end up with "convert 3xf32 -> *xf32" // after we set the static shapes for the main arguments. The "convert" // op does not support unranked result for ranked inputs. 
So, we use @@ -246,9 +252,16 @@ tsl::Status XlaCallModuleLoader::RefineDynamicShapes( op_builder.setInsertionPointToStart(&main_body); for (auto i = 0; i < main_body.getNumArguments(); ++i) { mlir::BlockArgument arg = main_body.getArgument(i); - auto convert_op = op_builder.create( - arg.getLoc(), arg.getType(), arg); - arg.replaceAllUsesExcept(convert_op, convert_op); + mlir::Type arg_type = arg.getType(); + if (IsTokenType(arg_type)) { + continue; + } + auto ranked_arg_type = arg_type.dyn_cast(); + if (!ranked_arg_type || !ranked_arg_type.hasStaticShape()) { + auto convert_op = op_builder.create( + arg.getLoc(), arg_type, arg); + arg.replaceAllUsesExcept(convert_op, convert_op); + } } auto static_array_output_types = llvm::to_vector(main_.getResultTypes()); @@ -376,7 +389,19 @@ tsl::Status XlaCallModuleLoader::LoadAndPreprocessModule( } mlir::Block &main_body = main_.front(); - int nr_token_arguments = main_has_token_input_output ? 1 : 0; + + int nr_token_arguments = llvm::count_if(InputTypes(), IsTokenType); + if (version < kVersionStartSupportEffects) { + bool has_token_at_start = (nr_token_arguments == 1 && + IsTokenType(main_.getArgument(0).getType())); + if (main_has_token_input_output != has_token_at_start) { + return absl::InvalidArgumentError(absl::StrCat( + "Expected a token at start iff main_has_token_input_output. ", + "Found main function type ", + mlir::debugString(main_.getFunctionType()), + " and main_has_token_input_output = ", main_has_token_input_output)); + } + } int nr_platform_args = (platform_index_ >= 0 ? 1 : 0); if (num_invocation_args != main_body.getNumArguments() - nr_token_arguments) { return absl::InvalidArgumentError(absl::StrCat( diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h index 33de164d0e453f..bb2e73e331a5a9 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h @@ -26,11 +26,15 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/IR/TypeRange.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "xla/client/xla_computation.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "tsl/platform/statusor.h" namespace tensorflow { +bool IsTokenType(mlir::Type type); + class XlaCallModuleLoader { public: static tsl::StatusOr> Create( @@ -39,8 +43,11 @@ class XlaCallModuleLoader { std::vector platforms, std::string loading_platform, int num_invocation_args, bool main_has_token_input_output); - int nr_outputs() { return main_.getNumResults(); } - mlir::TypeRange output_types() { return main_.getResultTypes(); } + int NrInputs() { return main_.getNumArguments(); } + mlir::TypeRange InputTypes() { return main_.getArgumentTypes(); } + + int NrOutputs() { return main_.getNumResults(); } + mlir::TypeRange OutputTypes() { return main_.getResultTypes(); } // Refines the dynamic module arguments based on the static argument shapes. // This assumes that the module has a "main" function without dimension args, diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc index 200b24951ed68b..049142034f90b6 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc @@ -19,10 +19,12 @@ limitations under the License. 
#include #include +#include "absl/log/log.h" #include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/Extensions/AllExtensions.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -150,7 +152,36 @@ class XlaCallModuleOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("disabled_checks", &disabled_checks)); std::vector platforms; OP_REQUIRES_OK(ctx, ctx->GetAttr("platforms", &platforms)); + // TODO(necula): change this to OP_REQUIRES_OK when 6 months have passed + // since we added the function_list and has_token_input_output + // attributes (May 25, 2023). + bool main_has_token_input_output = false; + if (!ctx->GetAttr("has_token_input_output", &main_has_token_input_output) + .ok()) { + // Whether the StableHLO module's main function has token input/output as + // the first argument and the first result. + // This is used only prior to version 9; afterwards, we just look for + // tokens among the types of the arguments and results, and we support + // multiple tokens, not necessarily at the start. + main_has_token_input_output = false; + } + if (!ctx->GetAttr("function_list", &function_list_).ok()) { + function_list_.clear(); + } + if (VLOG_IS_ON(3)) { + VLOG(3) << "Initializing XlaCallModuleOp (version = " << version + << ", platforms = [" << absl::StrJoin(platforms, ", ") + << "], has_token_input_output = " << main_has_token_input_output + << ", disabled_checks = [" << absl::StrJoin(disabled_checks, ", ") + << "], " + << "function_list = [" + << absl::StrJoin(function_list_, ",", + [](std::string *out, NameAttrList x) { + absl::StrAppend(out, x.name()); + }) + << "])"; + } string loading_device_type = ctx->device_type().type_string(); string loading_platform = ""; if (loading_device_type == DEVICE_CPU_XLA_JIT) { @@ -171,31 +202,21 @@ class XlaCallModuleOp : public XlaOpKernel { absl::UnimplementedError(absl::StrCat( "Unexpected device type ", loading_device_type))); } - VLOG(3) << "Initialized XlaCallModuleOp on " << loading_platform; - if (!ctx->GetAttr("has_token_input_output", &module_has_token_input_output_) - .ok()) { - module_has_token_input_output_ = false; - } + VLOG(3) << "Initializing XlaCallModuleOp on " << loading_platform; { auto loader = XlaCallModuleLoader::Create( &context_, version, std::move(module_str), std::move(disabled_checks), std::move(platforms), loading_platform, /*num_invocation_args=*/ctx->num_inputs(), - module_has_token_input_output_); + main_has_token_input_output); OP_REQUIRES_OK(ctx, loader.status()); loader_ = *std::move(loader); } OP_REQUIRES_OK(ctx, loader_->ValidateDialect()); - if (!ctx->GetAttr("function_list", &function_list_).ok()) { - function_list_.clear(); - } - - if (!ctx->GetAttr(kXlaTokenInputNodesAttrName, &token_input_nodes_).ok()) { - token_input_nodes_.clear(); - op_has_token_input_output_ = false; - } else { - op_has_token_input_output_ = !token_input_nodes_.empty(); + if (!ctx->GetAttr(kXlaTokenInputNodesAttrName, &op_token_input_nodes_) + .ok()) { + op_token_input_nodes_.clear(); } if (!ctx->GetAttr(kXlaOriginalOutsideCompilationNodeName, &original_node_name_) @@ -213,13 +234,15 @@ class XlaCallModuleOp : public XlaOpKernel { xla::XlaBuilder *const b = ctx->builder(); std::vector input_shapes; - if (module_has_token_input_output_) { - input_shapes.push_back(xla::ShapeUtil::MakeTokenShape()); - } - for (int i = 0; i < 
ctx->num_inputs(); ++i) { - auto shape = ctx->InputXlaShape(i); - OP_REQUIRES_OK(ctx, shape.status()); - input_shapes.push_back(*std::move(shape)); + int next_actual_input = 0; + for (mlir::Type inputType : loader_->InputTypes()) { + if (IsTokenType(inputType)) { + input_shapes.push_back(xla::ShapeUtil::MakeTokenShape()); + } else { + auto shape = ctx->InputXlaShape(next_actual_input++); + OP_REQUIRES_OK(ctx, shape.status()); + input_shapes.push_back(*std::move(shape)); + } } OP_REQUIRES_OK(ctx, loader_->RefineDynamicShapes(input_shapes)); OP_REQUIRES_OK(ctx, loader_->ValidateStaticShapes()); @@ -228,27 +251,30 @@ class XlaCallModuleOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, LowerTfFunctionCalls(ctx)); } + xla::XlaOp token_input; + if (!op_token_input_nodes_.empty()) { + std::vector token_inputs; + for (const string &node_name : op_token_input_nodes_) { + auto token = compiler->GetNodeToken(node_name); + OP_REQUIRES_OK(ctx, token.status()); + token_inputs.push_back(token.value()); + } + token_input = xla::AfterAll(b, token_inputs); + } + std::vector inputs; - if (module_has_token_input_output_) { - // The main function expects a token input at the start. - if (!token_input_nodes_.empty()) { - std::vector token_inputs; - for (const string &node_name : token_input_nodes_) { - auto token = compiler->GetNodeToken(node_name); - OP_REQUIRES_OK(ctx, token.status()); - token_inputs.push_back(token.value()); + next_actual_input = 0; + for (mlir::Type inputType : loader_->InputTypes()) { + if (IsTokenType(inputType)) { + if (token_input.IsUninitialized()) { + // Generate a dummy token if the XlaCallModule doesn't take one. + token_input = xla::CreateToken(b); } - inputs.push_back(xla::AfterAll(b, token_inputs)); + inputs.push_back(token_input); } else { - // Generate a dummy token if the main function expects a token but the - // XlaCallModule doesn't take one. - inputs.push_back(xla::CreateToken(b)); + inputs.push_back(ctx->Input(next_actual_input++)); } } - for (int i = 0, end = ctx->num_inputs(); i < end; ++i) { - inputs.push_back(ctx->Input(i)); - } - auto xla_computation = loader_->ToXlaComputation(); OP_REQUIRES_OK(ctx, xla_computation.status()); @@ -266,47 +292,58 @@ class XlaCallModuleOp : public XlaOpKernel { hlo_module->ToString(options))); } - xla::XlaOp output = xla::Call(b, *xla_computation, inputs); + xla::XlaOp computation_output = xla::Call(b, *xla_computation, inputs); // Check that the resulting computation returns the expected shape - OP_REQUIRES_VALUE(xla::Shape found_output_shape, ctx, b->GetShape(output)); + OP_REQUIRES_VALUE(xla::Shape found_output_shape, ctx, + b->GetShape(computation_output)); VLOG(3) << "XlaCallModule compiled output shape : " << xla::ShapeUtil::HumanString(found_output_shape); - - std::vector outputs; - if (loader_->nr_outputs() == 1) { - outputs.push_back(output); + std::vector computation_outputs; + if (loader_->NrOutputs() == 1) { + computation_outputs.push_back(computation_output); } else { - for (int i = 0; i < loader_->nr_outputs(); ++i) { - outputs.push_back(xla::GetTupleElement(output, i)); + for (int i = 0; i < loader_->NrOutputs(); ++i) { + computation_outputs.push_back( + xla::GetTupleElement(computation_output, i)); } } - xla::XlaOp token_output; - if (module_has_token_input_output_) { - // The main function returns a token as the first output. 
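The token plumbing in Compile() amounts to: merge every token named in kXlaTokenInputNodes with xla::AfterAll and feed that single token to each token-typed main argument, falling back to a fresh dummy token when the op has no token inputs at all. A small sketch of that choice, assuming the XLA client builder API already used by this file; MakeTokenInput is a hypothetical helper, not part of the patch:

#include <vector>
#include "xla/client/xla_builder.h"

// Builds the single token value passed to token-typed arguments of 'main'.
xla::XlaOp MakeTokenInput(xla::XlaBuilder* b,
                          const std::vector<xla::XlaOp>& node_tokens) {
  if (node_tokens.empty()) {
    // No incoming tokens on the op: satisfy the callee with a dummy token.
    return xla::CreateToken(b);
  }
  // Order all incoming effects before the call.
  return xla::AfterAll(b, node_tokens);
}

AfterAll keeps the incoming effects ordered before the call, while CreateToken merely satisfies the callee's token argument when there is nothing to order.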
- token_output = outputs.front(); - outputs.erase(outputs.begin()); - auto shape = b->GetShape(token_output); + // Collect the token outputs and set the non-token outputs + std::vector token_outputs; + int next_actual_output = 0; + for (auto it : llvm::enumerate(loader_->OutputTypes())) { + int i = it.index(); + mlir::Type output_type = it.value(); + auto shape = b->GetShape(computation_outputs[i]); OP_REQUIRES_OK(ctx, shape.status()); - OP_REQUIRES(ctx, shape->IsToken(), - absl::FailedPreconditionError( - absl::StrCat("Token output is not token type: ", - xla::ShapeUtil::HumanString(*shape)))); + if (IsTokenType(output_type)) { + OP_REQUIRES(ctx, shape->IsToken(), + absl::FailedPreconditionError(absl::StrCat( + "Token output at index ", i, " is not token type: ", + xla::ShapeUtil::HumanString(*shape)))); + token_outputs.push_back(computation_outputs[i]); + } else { + OP_REQUIRES(ctx, !shape->IsToken(), + absl::FailedPreconditionError(absl::StrCat( + "Non-token output at index ", i, " is a token type: ", + xla::ShapeUtil::HumanString(*shape)))); + ctx->SetOutput(next_actual_output++, computation_outputs[i]); + } } - if (op_has_token_input_output_) { - if (token_output.IsUninitialized()) { - // The main function does not return any token, but the XlaCallModule is - // expected to return one. Create a dummy token. - token_output = xla::CreateToken(b); + + if (!op_token_input_nodes_.empty()) { + xla::XlaOp token_output = token_input; + if (!token_outputs.empty()) { + token_output = xla::AfterAll(b, token_outputs); + } else { + if (token_output.IsUninitialized()) { + token_output = xla::CreateToken(b); + } } OP_REQUIRES_OK(ctx, compiler->SetNodeToken(original_node_name_, token_output)); } - - for (int i = 0; i < outputs.size(); ++i) { - ctx->SetOutput(i, outputs[i]); - } } private: @@ -404,7 +441,7 @@ class XlaCallModuleOp : public XlaOpKernel { options.always_return_tuple = true; options.is_entry_computation = false; // Propagate tokens from XlaCallModule to inner computation. - options.add_token_input_output = op_has_token_input_output_; + options.add_token_input_output = !op_token_input_nodes_.empty(); XlaCompiler::CompilationResult result; TF_RETURN_IF_ERROR( @@ -518,11 +555,8 @@ class XlaCallModuleOp : public XlaOpKernel { std::unique_ptr loader_; std::vector function_list_; - // Whether the StableHLO module's main function has token input/output. - bool module_has_token_input_output_; // Whether the XlaCallModule op has token input/output. - bool op_has_token_input_output_; - std::vector token_input_nodes_; + std::vector op_token_input_nodes_; std::string original_node_name_; }; diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 6df7dded2383e5..b69aa2be5e2e0c 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -22,18 +22,23 @@ limitations under the License. 
#include "absl/log/log.h" #include "absl/status/status.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/bridge.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h" #include "tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h" #include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h" #include "tensorflow/compiler/tf2xla/tf2xla_defs.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/framework/metrics.h" #include "tensorflow/core/lib/monitoring/gauge.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/tpu/tpu_defs.h" #include "tensorflow/core/util/device_name_utils.h" @@ -51,6 +56,8 @@ auto* mlir_bridge_gauge_v2 = monitoring::Gauge::New( namespace { +using ::mlir::ModuleOp; + bool HasTPUDevice(mlir::ModuleOp module) { mlir::TF::RuntimeDevices devices; if (failed(GetDevicesFromOp(module.getOperation(), &devices))) return false; @@ -132,6 +139,33 @@ bool HasTPUPartitionedCallOpInModule(mlir::ModuleOp module) { return has_tpu_partitioned_call; } +// V1 Compat Bridge extracts out a program into a submodule and runs clustering +// only on the submodule. +absl::Status RunLowerToRuntimeOpsOnSubmodule(ModuleOp parent_module, + bool is_in_fallback_enabled_mode) { + int num_submodules = 0; + absl::Status runtime_lowering_status; + parent_module.walk([&](ModuleOp submodule) { + if (submodule == parent_module) return mlir::WalkResult::advance(); + num_submodules++; + runtime_lowering_status = + tensorflow::tfrt_compiler::RunLowerClusterToRuntimeOpsPassPipeline( + submodule, tsl::DeviceType(DEVICE_TPU_XLA_JIT)); + if (num_submodules > 1) { + return mlir::WalkResult::interrupt(); + } + + return mlir::WalkResult::advance(); + }); + + if (num_submodules > 1) { + return absl::InternalError( + "Lower to runtime has more than one submodule. 
Erroring out."); + } + + return runtime_lowering_status; +} + } // namespace // Analyzes the user requested policy as well as the contents of the graph and @@ -270,8 +304,8 @@ Status MlirBridgePass::Run(const std::string& function_name, return OkStatus(); } + bool fallback_enabled = false; if (run_tpu_bridge) { - bool fallback_enabled = false; if (pass_state == MlirOptimizationPassState::FallbackEnabled) { // We set `uses_uninitialized_resource_args` to false here because the // first phase of the bridge is not affected by uninitialized resource @@ -293,14 +327,18 @@ Status MlirBridgePass::Run(const std::string& function_name, TF_RETURN_IF_ERROR( tensorflow::tfrt_compiler::RunLowerClusterToRuntimeOpsPassPipeline( module, tsl::DeviceType(DEVICE_TPU_XLA_JIT))); - + } else { + VLOG(1) << "Running GPU/CPU Bridge"; TF_RETURN_IF_ERROR( - tensorflow::tf2xla::v2::ExportFromTensorflowDialectToExecutor(module)); + tensorflow::tf2xla::v2::RunFunctionTf2xlaClusteringBridge( + module, tf2xla::v2::DeviceType::XLA_GPU_JIT, fallback_enabled)); - return absl::OkStatus(); + TF_RETURN_IF_ERROR( + tensorflow::tfrt_compiler::RunLowerClusterToRuntimeOpsPassPipeline( + module, tsl::DeviceType(DEVICE_GPU_XLA_JIT))); } - VLOG(1) << "Running MLIR CPU/GPU Bridge"; - return mlir::TF::RunTFXLABridge(module, function_name); + + return tensorflow::tf2xla::v2::ExportFromTensorflowDialectToExecutor(module); } MlirOptimizationPassState MlirBridgeV1CompatPass::GetPassState( @@ -402,10 +440,19 @@ Status MlirBridgeV1CompatPass::Run(const GraphOptimizationPassOptions& options, } VLOG(1) << "Running MLIR TPU Bridge V1 Compat"; - mlir_bridge_gauge_v1->GetCell()->Set(true); - return tensorflow::tf2xla::v1::RunSessionTf2xlaClusteringBridge( - module, fallback_enabled); + TF_RETURN_IF_ERROR(tensorflow::tf2xla::v1::RunSessionTf2xlaClusteringBridge( + module, fallback_enabled)); + + auto lower_cluster_to_runtime_ops_pass_pipeline = + RunLowerToRuntimeOpsOnSubmodule(module, fallback_enabled); + if (!lower_cluster_to_runtime_ops_pass_pipeline.ok()) { + VLOG(1) << "Error while lowering cluster to runtime ops: " + << lower_cluster_to_runtime_ops_pass_pipeline; + return lower_cluster_to_runtime_ops_pass_pipeline; + } + + return tensorflow::tf2xla::v1::ExportFromTensorflowDialectToExecutor(module); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index 4aa2811585c8c8..480dc474410359 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -1392,7 +1392,10 @@ function_list: This list contains the TensorFlow FunctionDefs that are used by has_token_input_output: If true, the embedded StableHLO module's main function must take a `!stablehlo.token` as its first argument and returns a token as its first result. This can be used in conjunction with the TF2XLA's side - effect mechanism in order to model side effects. + effect mechanism in order to model side effects. This is used only in versions + prior to version 9. After that, the number and position of tokens among + the arguments and results are obtained from the main function type. This + allows us to support more than one token and not necessarily at the start. disabled_checks: A list of strings describing the safety checks that were disabled at serialization time. This attribute was added in version 6. 
For more details see diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index 80eecfb0d2f7e2..27940b7fb92c17 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -669,7 +669,7 @@ def call_module_maximum_supported_version(): See versioning details documentation for the XlaCallModule op at: https://github.com/search?q=repo%3Atensorflow%2Ftensorflow+path%3Axla_call_module+%22int+VERSION_MAXIMUM_SUPPORTED%22&type=code """ - return 8 + return 9 # pylint: enable=g-doc-args # pylint: enable=g-doc-return-or-yield diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD index 4884972676b864..b3efda8f4791b7 100644 --- a/tensorflow/core/api_def/BUILD +++ b/tensorflow/core/api_def/BUILD @@ -6,12 +6,12 @@ # :python_api_def # :java_api_def -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", ) +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//third_party/mkl:build_defs.bzl", "if_mkl", @@ -116,5 +116,8 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/platform:resource_loader", + "//tensorflow/core/tpu/ops:sparse_core_ops", + "//tensorflow/core/tpu/ops:sparse_core_preprocess_ops", + "//tensorflow/core/tpu/ops:tpu_copy_with_dynamic_shape_op", ], ) diff --git a/tensorflow/core/api_def/base_api/api_def_ConvertToCooTensor.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConvertToCooTensor.pbtxt new file mode 100644 index 00000000000000..c9d629514f49f7 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_ConvertToCooTensor.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "ConvertToCooTensor" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/base_api/api_def_GetMinibatchSplitsWithPhysicalReplica.pbtxt b/tensorflow/core/api_def/base_api/api_def_GetMinibatchSplitsWithPhysicalReplica.pbtxt new file mode 100644 index 00000000000000..e402d2bf92de67 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_GetMinibatchSplitsWithPhysicalReplica.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "GetMinibatchSplitsWithPhysicalReplica" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/base_api/api_def_GetMinibatchesInCsrWithPhysicalReplica.pbtxt b/tensorflow/core/api_def/base_api/api_def_GetMinibatchesInCsrWithPhysicalReplica.pbtxt new file mode 100644 index 00000000000000..49493ee2254354 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_GetMinibatchesInCsrWithPhysicalReplica.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "GetMinibatchesInCsrWithPhysicalReplica" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/base_api/api_def_StoreMinibatchStatisticsInFdo.pbtxt b/tensorflow/core/api_def/base_api/api_def_StoreMinibatchStatisticsInFdo.pbtxt new file mode 100644 index 00000000000000..9545ffdf9c4b7e --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_StoreMinibatchStatisticsInFdo.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "StoreMinibatchStatisticsInFdo" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/base_api/api_def_TPUAnnotateTensorsWithDynamicShape.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUAnnotateTensorsWithDynamicShape.pbtxt new file mode 100644 index 00000000000000..84ac58c2e7e5ea --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_TPUAnnotateTensorsWithDynamicShape.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "TPUAnnotateTensorsWithDynamicShape" + visibility: HIDDEN +} diff --git 
a/tensorflow/core/api_def/base_api/api_def_TPUCopyWithDynamicShape.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUCopyWithDynamicShape.pbtxt new file mode 100644 index 00000000000000..423e1a22244a4b --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_TPUCopyWithDynamicShape.pbtxt @@ -0,0 +1,8 @@ +op { + graph_op_name: "TPUCopyWithDynamicShape" + visibility: HIDDEN + summary: <
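Returning to the V1-compat bridge change earlier in this diff: RunLowerToRuntimeOpsOnSubmodule relies on ModuleOp::walk with WalkResult to visit the single nested module produced by clustering and to bail out early if more than one is found. A minimal sketch of that walk pattern in isolation; CountSubmodules is a hypothetical name used only for illustration:

#include "mlir/IR/BuiltinOps.h"  // mlir::ModuleOp
#include "mlir/IR/Visitors.h"    // mlir::WalkResult

// Counts module ops nested inside parent_module, stopping the traversal as
// soon as a second one is found, as the helper in mlir_bridge_pass.cc does.
int CountSubmodules(mlir::ModuleOp parent_module) {
  int num_submodules = 0;
  parent_module.walk([&](mlir::ModuleOp submodule) {
    if (submodule == parent_module) return mlir::WalkResult::advance();
    ++num_submodules;
    if (num_submodules > 1) return mlir::WalkResult::interrupt();
    return mlir::WalkResult::advance();
  });
  return num_submodules;
}

Interrupting the walk keeps the error check cheap: runtime lowering is only valid on exactly one extracted submodule, so there is no need to keep traversing once a second nested module is seen.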